Skip to content

Commit

Permalink
Merge branch 'hotfix/0.9.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
JeltevanBoheemen committed May 28, 2024
2 parents 2e4c876 + 2a64e67 commit 2e221ca
Show file tree
Hide file tree
Showing 46 changed files with 3,896 additions and 5,356 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ abstract: >-
transcripts, to aid clinical linguists and research into
language development and language disorders.
license: BSD-3-Clause
version: 0.9.0
version: 0.9.1
date-released: '2024-01-31'
93 changes: 0 additions & 93 deletions backend/analysis/conftest.py

This file was deleted.

4 changes: 2 additions & 2 deletions backend/analysis/convert/tests/convert_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ def test_quotemarks(quotemarks):
assert replace_quotation_marks(line) == expected


def test_chat_replacements(cha_testfiles_dir, tarsp_category):
def test_chat_replacements(testfiles_dir, tarsp_category):
'''Test if CHAT input handles replacements correctly'''
fn = op.join(cha_testfiles_dir, 'sample_1.cha')
fn = op.join(testfiles_dir, 'sample_1.cha')
doc = ChatDocument.from_chatfile(fn, tarsp_category)
line = doc.lines[1]

Expand Down
6 changes: 6 additions & 0 deletions backend/analysis/query/query_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sastadev.methods import Method

from annotations.reader import read_saf
from parse.parse_utils import correct_transcript


def prepare_parameters(infilename: str, method: Method, targets: int, annotationinput: bool) -> SastaCoreParameters:
Expand All @@ -19,7 +20,12 @@ def prepare_parameters(infilename: str, method: Method, targets: int, annotation

def prepare_treebanks(transcript: Transcript) -> Tuple[Tuple[str, etree.ElementTree]]:
orig_fp = transcript.parsed_content.path

# TODO: FIX THIS PROPERLY
if not transcript.corrected_content:
correct_transcript(transcript)
corr_fp = transcript.corrected_content.path

orig_treebank = etree.parse(orig_fp).getroot()
corr_treebank = etree.parse(corr_fp).getroot()
return (
Expand Down
Empty file.
25 changes: 25 additions & 0 deletions backend/annotations/tests/chat_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from annotations.writers.saf_chat import enrich_chat


def test_chat_enrich(single_utt_allresults, asta_transcripts):
    """Check that CHAT enrichment puts %xsyn tiers on the expected lines."""
    transcript = asta_transcripts.get(name='single_utt')
    enriched = enrich_chat(
        transcript=transcript,
        allresults=single_utt_allresults,
        method=transcript.corpus.default_method,
    )

    # Only the second document line should carry an %xsyn annotation tier.
    first, second = enriched.lines[0], enriched.lines[1]
    assert first.tiers.get('xsyn') is None
    assert second.tiers.get('xsyn') is not None


def test_chat_enrich_newids(single_utt_allresults, asta_transcripts, tmp_path):
    """Check CHAT enrichment for the new Corpus2Alpino style.

    In this style, uttids are not overwritten by xsid.
    """
    transcript = asta_transcripts.get(name='single_utt_newstyle')
    enriched = enrich_chat(
        transcript=transcript,
        allresults=single_utt_allresults,
        method=transcript.corpus.default_method,
    )

    # Only the second document line should carry an %xsyn annotation tier.
    first, second = enriched.lines[0], enriched.lines[1]
    assert first.tiers.get('xsyn') is None
    assert second.tiers.get('xsyn') is not None
7 changes: 7 additions & 0 deletions backend/annotations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,10 @@ def autosize_columns(worksheet) -> None:
dim_holder[get_column_letter(col)] = ColumnDimension(
worksheet, min=col, max=col, auto_size=True)
worksheet.column_dimensions = dim_holder


def cast_to_bool(value) -> bool:
    """Normalise a flag value to a boolean.

    Values may arrive either as real booleans or as 'yes'/'no' strings
    (e.g. the `inform` column of method spreadsheets); any string other
    than exactly 'yes' counts as False.

    The original implementation fell through and returned None for any
    other type, breaking the declared `-> bool` contract; unexpected
    types (e.g. None) are now explicitly False, which preserves the old
    truthiness in boolean contexts.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value == 'yes'
    return False
30 changes: 22 additions & 8 deletions backend/annotations/writers/saf_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from analysis.models import AssessmentMethod, Transcript
from analysis.results.results import AllResults
from chamd.chat_reader import ChatLine, ChatTier
from annotations.utils import cast_to_bool
from convert.chat_reader import ChatDocument
from natsort import natsorted
from sastadev.sastatypes import ExactResultsDict
Expand All @@ -18,14 +19,25 @@ def _items_by_utt_word(exactresults: ExactResultsDict, items_mapping: Dict) -> D

for (qid, _), hits in results.items():
for (utt_id, wordno) in hits:
out[utt_id][wordno].append(items_mapping.get(qid))
mapped = items_mapping.get(qid)
if mapped:
out[utt_id][wordno].append(mapped)

return out


def _find_doc_line(lines: List[ChatLine], uttno: int) -> ChatLine:
# TODO: more efficient way to do this?
return next((x for x in lines if x.uttid == uttno), None)
# return next((x for x in lines if x.uttno == uttno), None)
return lines[uttno - 1]


def find_doc_line_xsid(lines: List[ChatLine], xsid: int) -> ChatLine:
    """Return the first line whose 'xsid' tier matches *xsid*, or None."""
    wanted = str(xsid)
    return next(
        (line for line in lines
         if (tier := line.tiers.get('xsid')) and tier.text == wanted),
        None)


def enrich_chat(transcript: Transcript,
Expand All @@ -34,22 +46,24 @@ def enrich_chat(transcript: Transcript,
doc = ChatDocument.from_chatfile(
transcript.content.path, transcript.corpus.method_category)

target_ids = transcript.target_ids

# construct a mapping of uttno to uttid
# because uttid is unknown to CHAT
marked_utts = (x for x in transcript.utterances.all() if x.for_analysis)
id_no_mapping = {
u.utt_id: u.uttno for u in marked_utts
}

# create mapping of query_ids to items
items_mapping = {q.query_id: q.item for q in method.queries.all()}
items_mapping = {
q.query_id: q.item for q in method.queries.all() if cast_to_bool(q.inform)}

results_by_word = _items_by_utt_word(
allresults.exactresults, items_mapping)

for utt_id, words in results_by_word.items():
uttno = id_no_mapping.get(int(utt_id))
doc_line = _find_doc_line(doc.lines, uttno)
if target_ids:
doc_line = find_doc_line_xsid(doc.lines, int(utt_id))
else:
doc_line = _find_doc_line(doc.lines, int(utt_id))

utt_hits = []
for w in natsorted(words.keys()):
Expand Down
8 changes: 5 additions & 3 deletions backend/annotations/writers/saf_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
from sastadev.sastatypes import ExactResults
from annotations.constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS,
SAF_COMMENT_LEVEL, SAF_UTT_LEVEL)
from annotations.utils import autosize_columns, format_worksheet, get_max_words, ljust
from annotations.utils import autosize_columns, cast_to_bool, format_worksheet, get_max_words, ljust
from natsort import natsorted


@dataclass
class SAFWriter():
method: Method
Expand Down Expand Up @@ -76,7 +75,10 @@ def _make_annotations_worksheet(self) -> Worksheet:

# Fill with values
for qid, qresults in self.results.exactresults.items():
self._fill_query(qid, qresults)
query = self.method.queries.get(qid[0])
inform = cast_to_bool(query.inform)
if inform:
self._fill_query(qid, qresults)
return self.anno_ws

def _annotations_header_row(self) -> List[str]:
Expand Down
116 changes: 98 additions & 18 deletions backend/conftest.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,90 @@
import glob
from collections import Counter
from os import path as op
import os

import pytest
from analysis.models import AssessmentMethod, MethodCategory
from analysis.convert.convert import convert
from analysis.models import AssessmentMethod, Corpus, MethodCategory, Transcript
from django.conf import settings
from django.core.files import File
from sastadev.allresults import AllResults
from sastadev.conf import settings as sd_settings

from lxml import etree

from parse.parse_utils import create_utterance_objects


def _get_transcript_filenames(name: str, dir: str):
return {
'chat': f'{name}.cha',
'parsed': f'{name}.xml',
'corrected': f'{name}_corrected.xml'
}


def _make_transcript(corpus: Corpus, name: str, dir: str):
    """Create a PARSED Transcript for *corpus* from the test files in *dir*.

    Saves the raw CHAT file, converts it, attaches the parsed and corrected
    Alpino XML, creates the utterance objects, and returns the Transcript.
    """
    # Bug fix: the builtin `str` was passed instead of the directory
    # (harmless only because the helper currently ignores its second arg).
    filenames = _get_transcript_filenames(name, dir)

    obj = Transcript.objects.create(
        name=name,
        status=Transcript.PARSED,
        corpus=corpus
    )

    with open(op.join(dir, filenames['chat']), 'rb') as f:
        obj.content.save(filenames['chat'], File(f))

    convert(obj)

    with open(op.join(dir, filenames['parsed']), 'rb') as f:
        obj.parsed_content.save(filenames['parsed'], File(f))
    with open(op.join(dir, filenames['corrected']), 'rb') as f:
        obj.corrected_content.save(filenames['corrected'], File(f))

    create_utterance_objects(obj)

    obj.save()
    return obj


def _make_method_transcripts(corpus: Corpus, testfiles_dir):
    """Create one Transcript per subdirectory of the corpus' method test dir.

    Returns the corpus' transcript queryset; sanity-checks that exactly one
    transcript was created per directory entry.
    """
    base = op.join(testfiles_dir, corpus.method_category.name)
    entries = os.listdir(base)

    for entry in entries:
        _make_transcript(corpus, entry, op.join(base, entry))

    created = corpus.transcripts.all()
    assert created.count() == len(entries)
    return created


@pytest.fixture
def testfiles_dir():
    """Absolute path of the backend 'test_files' directory."""
    path = op.join(settings.BASE_DIR, 'test_files')
    return path


@pytest.fixture
def tarsp_category(db):
obj = MethodCategory.objects.create(
return MethodCategory.objects.create(
name='TARSP', zc_embeddings=True,
levels=['Sz', 'Zc', 'Wg', 'VVW'],
marking_postcodes=['[+ G]'])
yield obj
obj.delete()


@pytest.fixture
def tarsp_corpus(db, admin_user, tarsp_method, tarsp_category):
    """A freshly created TARSP test corpus owned by the admin user."""
    return Corpus.objects.create(
        user=admin_user,
        name='tarsp_test_corpus',
        status='created',
        default_method=tarsp_method,
        method_category=tarsp_category,
    )


@pytest.fixture
Expand All @@ -33,22 +93,30 @@ def stap_category(db):
name='STAP', zc_embeddings=False,
levels=['Complexiteit', 'Grammaticale fout'],
marking_postcodes=['[+ G]', '[+ VU]'])
yield obj
obj.delete()
return obj


@pytest.fixture
def asta_category(db):
obj = MethodCategory.objects.create(
return MethodCategory.objects.create(
name='ASTA', zc_embeddings=False, levels=[
"Samplegrootte",
"MLU",
"Taalmaat",
"Foutenanalyse",
"Lemma"
], marking_postcodes=["[+ G]"])
yield obj
obj.delete()


@pytest.fixture
def asta_corpus(db, admin_user, asta_method, asta_category):
    """A freshly created ASTA test corpus owned by the admin user."""
    corpus = Corpus.objects.create(
        user=admin_user,
        name='asta_test_corpus',
        status='created',
        default_method=asta_method,
        method_category=asta_category,
    )
    return corpus


@pytest.fixture
Expand All @@ -58,14 +126,13 @@ def method_dir():

@pytest.fixture
def tarsp_method(db, tarsp_category, method_dir):
file = glob.glob(f'{method_dir}/TARSP Index Current.xlsx')[0]
file = glob.glob(f'{method_dir}/TARSP_Index_Current.xlsx')[0]
with open(file, 'rb') as f:
wrapped_file = File(f)
instance = AssessmentMethod(
name='tarsp_test_method', category=tarsp_category)
instance.content.save(op.basename(file), wrapped_file)
yield instance
instance.delete()
return instance


@pytest.fixture
Expand All @@ -76,14 +143,23 @@ def asta_method(db, asta_category, method_dir):
instance = AssessmentMethod(
name='asta_test_method', category=asta_category)
instance.content.save(op.basename(file), wrapped_file)
yield instance
instance.delete()
return instance


@pytest.fixture(autouse=True)
def asta_transcripts(db, asta_corpus, testfiles_dir):
    """Autouse fixture: populate the ASTA corpus with its test transcripts."""
    transcripts = _make_method_transcripts(asta_corpus, testfiles_dir)
    return transcripts


@pytest.fixture(autouse=True)
def tarsp_transcripts(db, tarsp_corpus, testfiles_dir):
    """Autouse fixture: populate the TARSP corpus with its test transcripts."""
    transcripts = _make_method_transcripts(tarsp_corpus, testfiles_dir)
    return transcripts


@pytest.fixture
def single_utt_allresults(cha_testfiles_dir):
def single_utt_allresults(testfiles_dir):
parsed = etree.parse(
op.join(cha_testfiles_dir, 'single_utt_corrected.xml'))
op.join(testfiles_dir, 'ASTA', 'single_utt', 'single_utt_corrected.xml'))
utts = parsed.xpath('alpino_ds')

return AllResults(
Expand Down Expand Up @@ -111,5 +187,9 @@ def single_utt_allresults(cha_testfiles_dir):
allutts={1: ['ja', 'uh', 'ik', 'vind', 'het', 'beetje', 'moeilijk',
'om', 'het', 'goed', 'te', 'vertellen', 'want', 'ik',
'heb', 'een', 'ongeluk', 'gehad']}

)


@pytest.fixture
def all_transcripts(asta_transcripts, tarsp_transcripts):
    """All Transcript objects created by the ASTA and TARSP fixtures.

    Depends on both transcript fixtures only to force their creation;
    returns the full Transcript queryset.
    """
    return Transcript.objects.all()
Loading

0 comments on commit 2e221ca

Please sign in to comment.