Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON corpus interface #1584

Merged
merged 26 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e41db24
add corpus definitions link
lukavdplas May 15, 2024
74b1235
generate definitions overview component
lukavdplas May 15, 2024
9540010
basic interface for definitions overview
lukavdplas May 15, 2024
cc9521b
add APICorpusDefinition interface
lukavdplas May 15, 2024
ce9206c
fetch definitions in overview
lukavdplas May 15, 2024
9ac7a07
draft create-definition component
lukavdplas May 16, 2024
d2a9f3f
draft edit corpus component
lukavdplas May 16, 2024
63ca156
add breadcrumbs
lukavdplas May 16, 2024
c9a6340
use router state in edit definition component
lukavdplas May 16, 2024
9855b77
include search link in overview
lukavdplas May 16, 2024
d623079
update tests
lukavdplas May 16, 2024
0f967cf
update API for corpus definitions
lukavdplas May 16, 2024
55d6b6f
update frontend to new API
lukavdplas May 16, 2024
7ee40ef
implement JSON download
lukavdplas May 16, 2024
c86c717
draft json upload
lukavdplas May 16, 2024
23d052d
show current state in edit component
lukavdplas May 16, 2024
3359a62
extract json upload component
lukavdplas May 16, 2024
a8a1517
draft CorpusDefinition model
lukavdplas May 17, 2024
41d5fac
simplify update interface
lukavdplas May 17, 2024
1a789cc
create definition form
lukavdplas May 17, 2024
ecd44d3
enable deleting corpora
lukavdplas May 17, 2024
76669ad
basic feedback for saving data
lukavdplas May 22, 2024
35fea96
update documentation
lukavdplas May 22, 2024
db611d2
clear corpus data in tests
lukavdplas May 22, 2024
7f9555c
fix property reference
lukavdplas May 22, 2024
968ac83
Merge branch 'develop' into feature/json-corpus-interface
JeltevanBoheemen Jun 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/addcorpus/json_corpora/export_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def export_json_corpus(corpus: Corpus) -> Dict:
config = corpus.configuration
data = {'name': corpus.name, 'id': corpus.pk }
data = {'name': corpus.name}
data['meta'] = export_corpus_meta(config)
data['source_data'] = export_corpus_source_data(config)
options = export_corpus_options(config)
Expand Down
5 changes: 2 additions & 3 deletions backend/addcorpus/json_corpora/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
from addcorpus.models import Corpus, Field
from addcorpus.json_corpora.import_json import _parse_field

def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data):
def test_corpus_export(json_mock_corpus: Corpus, json_corpus_definition):
result = export_json_corpus(json_mock_corpus)
result.pop('id')
assert result == json_corpus_data
assert result == json_corpus_definition

def test_field_export(any_field_json):
imported = _parse_field(any_field_json)
Expand Down
39 changes: 26 additions & 13 deletions backend/addcorpus/json_corpora/tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@
from addcorpus.serializers import CorpusJSONDefinitionSerializer
from addcorpus.models import Corpus, CorpusConfiguration

def test_json_corpus_import(db, json_corpus_data):
Corpus.objects.all().delete()
def test_json_corpus_import(db, json_mock_corpus, json_corpus_definition):
json_mock_corpus.delete()

serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data = {
'definition': json_corpus_definition,
'active': True,
}

serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
corpus = serializer.create(serializer.validated_data)

Expand Down Expand Up @@ -35,30 +40,38 @@ def test_json_corpus_import(db, json_corpus_data):
assert line_field.display_type == 'text_content'


def test_serializer_representation(db, json_corpus_data):
Corpus.objects.all().delete()
def test_serializer_representation(db, json_mock_corpus, json_corpus_definition):
json_mock_corpus.delete()

data = {
'definition': json_corpus_definition,
'active': True,
}

serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
corpus = serializer.create(serializer.validated_data)

serialized = serializer.to_representation(corpus)
serialized.pop('id')
assert json_corpus_data == serialized
assert json_corpus_definition == serialized['definition']

def test_serializer_update(db, json_corpus_data, json_mock_corpus: Corpus):
def test_serializer_update(db, json_corpus_definition, json_mock_corpus: Corpus):
# edit description
json_corpus_data['meta']['description'] = 'A different description'
serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data = {
'definition': json_corpus_definition,
'active': True,
}
data['definition']['meta']['description'] = 'A different description'
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
serializer.update(json_mock_corpus, serializer.validated_data)
corpus_config = CorpusConfiguration.objects.get(corpus=json_mock_corpus)
assert corpus_config.description == 'A different description'

# remove a field
assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 2
json_corpus_data['fields'] = json_corpus_data['fields'][:-1]
serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data['definition']['fields'] = data['definition']['fields'][:-1]
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
serializer.update(json_mock_corpus, serializer.validated_data)
assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 1
Expand Down
8 changes: 4 additions & 4 deletions backend/addcorpus/json_corpora/tests/test_validate.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from addcorpus.json_corpora.validate import validate


def test_validate(json_corpus_data):
validate(json_corpus_data)
def test_validate(json_corpus_definition):
validate(json_corpus_definition)


def test_validate_subschema(json_corpus_data):
source_data = json_corpus_data['source_data']
def test_validate_subschema(json_corpus_definition):
source_data = json_corpus_definition['source_data']
validate(source_data, 'properties', 'source_data')
40 changes: 29 additions & 11 deletions backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,33 +128,47 @@ class Meta:
fields = ['corpus_configuration', 'type', 'content']


class CorpusJSONDefinitionSerializer(serializers.ModelSerializer):
class Meta:
model = Corpus
fields = '__all__'
class JSONDefinitionField(serializers.Field):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

def get_attribute(self, instance: Corpus):
return instance

def to_representation(self, instance) -> Dict:
return export_json_corpus(instance)
def to_representation(self, value: Corpus) -> Dict:
return export_json_corpus(value)

def to_internal_value(self, data) -> Dict:
def to_internal_value(self, data: Dict) -> Dict:
return import_json_corpus(data)


class CorpusJSONDefinitionSerializer(serializers.ModelSerializer):
definition = JSONDefinitionField()

class Meta:
model = Corpus
fields = ['id', 'active', 'definition']
read_only_fields = ['id']

def create(self, validated_data: Dict):
configuration_data = validated_data.pop('configuration')
definition_data = validated_data.get('definition')
configuration_data = definition_data.pop('configuration')
fields_data = configuration_data.pop('fields')

corpus = Corpus.objects.create(**validated_data)
corpus = Corpus.objects.create(**definition_data)
configuration = CorpusConfiguration.objects.create(corpus=corpus, **configuration_data)
for field_data in fields_data:
Field.objects.create(corpus_configuration=configuration, **field_data)

if validated_data.get('active') == True:
corpus.active = True
corpus.save()

return corpus

def update(self, instance: Corpus, validated_data: Dict):
configuration_data = validated_data.pop('configuration')
definition_data = validated_data.get('definition')
configuration_data = definition_data.pop('configuration')
fields_data = configuration_data.pop('fields')

corpus = Corpus(pk=instance.pk, **validated_data)
corpus = Corpus(pk=instance.pk, **definition_data)
corpus.save()

configuration, _ = CorpusConfiguration.objects.get_or_create(corpus=corpus)
Expand All @@ -172,4 +186,8 @@ def update(self, instance: Corpus, validated_data: Dict):

configuration.fields.exclude(name__in=(f['name'] for f in fields_data)).delete()

if validated_data.get('active') == True:
corpus.active = True
corpus.save()

return corpus
5 changes: 3 additions & 2 deletions backend/addcorpus/tests/test_corpus_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora

def test_no_corpora(db, settings, admin_client):
Corpus.objects.all().delete()
settings.CORPORA = {}
load_and_save_all_corpora()

Expand Down Expand Up @@ -84,7 +85,7 @@ def test_corpus_not_publication_ready(admin_client, basic_mock_corpus):
response = admin_client.get('/api/corpus/')
corpus = not any(c['name'] == basic_mock_corpus for c in response.data)

def test_corpus_edit_views(admin_client: Client, json_corpus_data: Dict, json_mock_corpus: Corpus):
def test_corpus_edit_views(admin_client: Client, json_corpus_definition: Dict, json_mock_corpus: Corpus):
json_mock_corpus.delete()

response = admin_client.get('/api/corpus/definitions/')
Expand All @@ -93,7 +94,7 @@ def test_corpus_edit_views(admin_client: Client, json_corpus_data: Dict, json_mo

response = admin_client.post(
'/api/corpus/definitions/',
json_corpus_data,
{'definition': json_corpus_definition, 'active': True},
content_type='application/json',
)
assert status.is_success(response.status_code)
Expand Down
10 changes: 7 additions & 3 deletions backend/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,16 +197,20 @@ def add_mock_python_corpora_to_db(db, media_dir):


@pytest.fixture()
def json_corpus_data():
def json_corpus_definition():
path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'mock_corpus.json')
with open(path) as f:
return json.load(f)


@pytest.fixture(autouse=True)
def json_mock_corpus(db, json_corpus_data) -> Corpus:
def json_mock_corpus(db, json_corpus_definition) -> Corpus:
# add json mock corpora to the database at the start of each test
serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data = {
'definition': json_corpus_definition,
'active': True,
}
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
corpus = serializer.create(serializer.validated_data)

Expand Down
2 changes: 2 additions & 0 deletions backend/corpora/utils_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora
from addcorpus.models import Corpus

def corpus_from_api(client):
'''
Expand All @@ -11,6 +12,7 @@ def corpus_from_api(client):
useful when you have configured your settings with only one corpus.
'''

Corpus.objects.all().delete()
load_and_save_all_corpora()

response = client.get('/api/corpus/')
Expand Down
9 changes: 5 additions & 4 deletions documentation/First-time-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ The source files of a corpus are not included in this directory; ask another dev

Note: database-only corpora are still in development and not yet recommended for first-time users.

To add a database-only corpus, you will need a JSON definition of the corpus, and a directory with (a sample of) the pre-processed source data. To retrieve a JSON definition from a running I-analyzer server, visit `/api/corpus/edit/` and copy the JSON of the corpus you want to import.
To add a database-only corpus, you will need a JSON definition of the corpus, and a directory with (a sample of) the pre-processed source data. To retrieve a JSON definition from a running I-analyzer server, log in as a staff user and visit `/corpus-definitions/`. Open the corpus you want to import and click "Download JSON".

1. Start up your I-analyzer server. Make a POST request to `localhost:8000/api/corpus/edit/` (you can use the browsable API for this) to import the JSON definition.
2. Visit the admin menu (`localhost:8000/admin`). Go to "corpus configurations" and select your corpus. In the "data directory" field, add the path to your source data directory.
3. Activate your python virutal environment. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information.
1. Start up your I-analyzer server and log in as a staff user. Go to `localhost:4200/corpus-definitions/new`. Upload the JSON definition file and save.
2. Visit the admin menu (`localhost:4200/admin`). Go to "corpus configurations" and select your corpus. In the "data directory" field, add the path to your source data directory.
3. Activate your python virtual environment. Create an ElasticSearch index from the source files by running `yarn django index {corpusname}`. See [Indexing](documentation/Indexing-corpora.md) for more information.
4. Visit the admin menu again. Go to "corpora" and select the corpus. Set "active" to true and save.


## Running a dev environment
Expand Down
2 changes: 1 addition & 1 deletion documentation/Writing-a-corpus-definition-in-JSON.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The format is defined in [corpus.schema.json](/backend/addcorpus/schemas/corpus.

## Importing and exporting definitions

Currently, importing and exporting JSON definitions is only supported through the backend API.
You can import and export JSON definitions through the frontend. Visit `/corpus-definitions/` to do so.

Some notes on importing and exporting JSON definitions:

Expand Down
23 changes: 23 additions & 0 deletions frontend/src/app/app.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ import {

import { AboutComponent } from './about/about.component';
import { AppComponent } from './app.component';
import { CorpusDefinitionsModule } from './corpus-definitions/corpus-definitions.module';
import { CreateDefinitionComponent } from './corpus-definitions/create-definition/create-definition.component';
import { DefinitionsOverviewComponent } from './corpus-definitions/definitions-overview/definitions-overview.component';
import { EditDefinitionComponent } from './corpus-definitions/edit-definition/edit-definition.component';
import { CorpusModule } from './corpus-header/corpus.module';
import { CorpusInfoComponent } from './corpus-info/corpus-info.component';
import { CorpusSelectionModule } from './corpus-selection/corpus-selection.module';
Expand Down Expand Up @@ -131,6 +135,24 @@ export const appRoutes: Routes = [
component: TagOverviewComponent,
canActivate: [LoggedOnGuard],
},
{
path: 'corpus-definitions',
canActivate: [LoggedOnGuard],
children: [
{
path: 'new',
component: CreateDefinitionComponent,
},
{
path: 'edit/:corpusID',
component: EditDefinitionComponent,
},
{
path: '',
component: DefinitionsOverviewComponent,
},
]
},
{
path: '',
redirectTo: 'home',
Expand All @@ -156,6 +178,7 @@ export const imports: any[] = [
SharedModule,
// Feature Modules
CorpusModule,
CorpusDefinitionsModule,
CorpusSelectionModule,
DialogModule,
DocumentModule,
Expand Down
26 changes: 26 additions & 0 deletions frontend/src/app/corpus-definitions/corpus-definitions.module.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { NgModule } from '@angular/core';
import { DefinitionsOverviewComponent } from './definitions-overview/definitions-overview.component';
import { SharedModule } from '../shared/shared.module';
import { CreateDefinitionComponent } from './create-definition/create-definition.component';
import { EditDefinitionComponent } from './edit-definition/edit-definition.component';
import { DefinitionJsonUploadComponent } from './definition-json-upload/definition-json-upload.component';



/**
 * Angular feature module grouping the corpus-definition components.
 *
 * Declares the create, overview, edit and JSON-upload components. Only the
 * create, overview and edit components are exported;
 * DefinitionJsonUploadComponent is declared but not exported, so it is only
 * available to templates inside this module.
 */
@NgModule({
declarations: [
CreateDefinitionComponent,
DefinitionsOverviewComponent,
EditDefinitionComponent,
DefinitionJsonUploadComponent,
],
exports: [
CreateDefinitionComponent,
DefinitionsOverviewComponent,
EditDefinitionComponent,
],
imports: [
SharedModule
]
})
export class CorpusDefinitionsModule { }
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<section class="section">
<div class="container">
<nav aria-label="secondary navigation" class="breadcrumb">
<ul>
<li><a [routerLink]="['/']">Corpora</a></li>
<li><a [routerLink]="['..']">Definitions</a></li>
<li class="is-active" aria-current="page"><a>New corpus</a></li>
</ul>
</nav>

<h1 class="title">New corpus</h1>

<p class="block">
Upload a JSON definition file to add it as a corpus.
</p>

<form>
<div class="block">
<ia-definition-json-upload
[reset]="reset$" (upload)="onJSONUpload($event)">
</ia-definition-json-upload>
</div>

<div class="block" *ngIf="corpus.isComplete()">
<button class="button is-primary" type="submit" (click)="submit()">
<span class="icon" aria-hidden="true">
<fa-icon [icon]="formIcons.confirm"></fa-icon>
</span>
<span>
Save corpus
</span>
</button>
</div>

<div class="message is-danger" *ngIf="error">
<div class="message-body">
Could not save corpus: {{error.message}}
</div>
</div>
</form>
</div>
</section>
Loading
Loading