Skip to content

Commit

Permalink
FoLiA edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
oktaal committed Apr 3, 2024
1 parent 06fec52 commit 8d9db05
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 87 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ on: [push]
jobs:
build:

runs-on: ubuntu-18.04
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.10']
python-version: ['3.8', '3.10']

steps:
- uses: actions/checkout@v3
Expand Down
4 changes: 2 additions & 2 deletions corpus2alpino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def main(args=None):
parser.add_argument(
'-p', '--progress',
action='store_true',
help='Show progress bar, automatically turned on file output')
help='Show progress bar, automatically turned on for file output')
parser.add_argument('-t', '--split_treebanks',
action='store_true',
help='Split treebanks to separate files')
Expand Down Expand Up @@ -88,7 +88,7 @@ def main(args=None):
converter.target = FilesystemTarget(
options.output_path, not options.split_treebanks)

show_progress = options.progress if options.progress != None else options.output_path != None
show_progress = options.output_path != None or options.progress

if show_progress:
with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress:
Expand Down
68 changes: 42 additions & 26 deletions corpus2alpino/readers/folia.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@
from typing import Iterable

from corpus2alpino.abstracts import Reader
from corpus2alpino.models import (CollectedFile, Document, MetadataValue,
Utterance)
from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance
from corpus2alpino.readers.tokenizer import Tokenizer

import folia.main as folia

from .alpino_brackets import (escape_id, escape_word, format_add_lex,
format_folia)
from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia


class FoliaReader(Reader):
Expand All @@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None:

def read(self, collected_file: CollectedFile) -> Iterable[Document]:
try:
doc = folia.Document(string=collected_file.content,
autodeclare=True,
loadsetdefinitions=False)
doc = folia.Document(
string=collected_file.content,
autodeclare=True,
loadsetdefinitions=False,
)
self.tokenize(doc)
doc_metadata = self.get_metadata_dict(doc.metadata.items())

yield Document(collected_file,
list(self.get_utterances(doc, doc_metadata)),
doc_metadata)
yield Document(
collected_file,
list(self.get_utterances(doc, doc_metadata)),
doc_metadata,
)
except Exception as e:
raise Exception(collected_file.relpath + "/" +
collected_file.filename) from e
raise Exception(
collected_file.relpath + "/" + collected_file.filename
) from e

def tokenize(self, element):
"""
Tokenizes all the text which isn't tokenized yet.
"""
if len(element) == 0:
# no sub elements
if isinstance(element, folia.Text):
self.tokenize_element(element.text(), element)
return

for item in element:
if isinstance(item, folia.AbstractElement):
if isinstance(item, folia.Paragraph):
for sentence in item.sentences():
for _ in item.sentences():
break
else:
self.tokenize_paragraph(item)
else:
self.tokenize(item)

def tokenize_paragraph(self, paragraph):
text = ''
for textContent in paragraph.select(folia.TextContent):
text += textContent.text()
def tokenize_paragraph(self, paragraph: folia.Paragraph):
text = ""
for text_content in paragraph.select(folia.TextContent):
text += text_content.text()
self.tokenize_element(text, paragraph)

def tokenize_element(self, text: str, element: folia.AbstractElement):
sentences = self.tokenizer.process(text)
for line in sentences:
sentence = paragraph.add(folia.Sentence)
sentence = element.add(folia.Sentence)
for word in line.tokens():
if word:
sentence.add(folia.Word, word)
Expand Down Expand Up @@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata):
if word_sentence != sentence or word_paragraph != paragraph:
if words:
if sentence or paragraph:
yield self.create_utterance(paragraph, sentence, words, doc_metadata)
yield self.create_utterance(
paragraph, sentence, words, doc_metadata
)
words = []
sentence = word_sentence
paragraph = word_paragraph
Expand All @@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
"""

word_strings = map(lambda word: self.get_word_string(word), words)
line = " ".join(filter(lambda word: word != '', word_strings))
line = " ".join(filter(lambda word: word != "", word_strings))

if sentence:
container = sentence
Expand All @@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):

sentence_id = escape_id(container.id)
sentence_metadata = self.get_metadata_dict(
container.getmetadata().items(),
doc_metadata)
container.getmetadata().items(), doc_metadata
)

return Utterance(line, sentence_id, sentence_metadata, line)

Expand All @@ -135,7 +148,7 @@ def get_word_string(self, word):
text = item.text()
break
else:
return ''
return ""

try:
correction = word.getcorrection()
Expand All @@ -159,8 +172,11 @@ def get_word_string(self, word):
def get_metadata_dict(self, native_metadata, filter_by=None):
metadata = {}
for key, value in native_metadata:
if filter_by == None or not key in filter_by \
or filter_by[key].value != value:
if (
filter_by == None
or key not in filter_by
or filter_by[key].value != value
):
metadata[key] = MetadataValue(value)
return metadata

Expand All @@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile):
Determine whether this is a FoLiA XML file
"""

return '<FoLiA' in file.content[0:400]
return "<FoLiA" in file.content[0:400]
121 changes: 64 additions & 57 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,133 +1,140 @@
#
# This file is autogenerated by pip-compile with python 3.7
# This file is autogenerated by pip-compile with python 3.10
# To update, run:
#
# pip-compile
#
annotated-types==0.6.0
# via pydantic
argparse==1.4.0
# via corpus2alpino (setup.py)
beautifulsoup4==4.10.0
beautifulsoup4==4.12.3
# via tei-reader
blis==0.7.5
# via
# spacy
# thinc
catalogue==2.0.6
blis==0.7.11
# via thinc
catalogue==2.0.10
# via
# spacy
# srsly
# thinc
certifi==2021.10.8
certifi==2024.2.2
# via requests
chamd==0.5.8
chamd==0.5.12
# via corpus2alpino (setup.py)
charset-normalizer==2.0.10
charset-normalizer==3.3.2
# via requests
click==8.0.3
click==8.1.7
# via typer
cymem==2.0.6
cloudpathlib==0.16.0
# via weasel
confection==0.1.4
# via
# thinc
# weasel
cymem==2.0.8
# via
# preshed
# spacy
# thinc
folia==2.5.7
folia==2.5.11
# via corpus2alpino (setup.py)
idna==3.3
idna==3.6
# via requests
importlib-metadata==4.12.0
# via
# click
# rdflib
isodate==0.6.1
# via rdflib
jinja2==3.0.3
jinja2==3.1.3
# via spacy
langcodes==3.3.0
# via spacy
lxml==4.7.1
lxml==5.1.0
# via
# folia
# tei-reader
markupsafe==2.0.1
markupsafe==2.1.5
# via jinja2
murmurhash==1.0.6
murmurhash==1.0.10
# via
# preshed
# spacy
# thinc
numpy==1.21.5
numpy==1.24.4
# via
# blis
# spacy
# thinc
packaging==21.3
# via spacy
pathy==0.6.1
# via spacy
preshed==3.0.6
packaging==24.0
# via
# spacy
# thinc
pydantic==1.8.2
# weasel
preshed==3.0.9
# via
# spacy
# thinc
pyparsing==3.0.7
pydantic==2.6.4
# via
# packaging
# rdflib
rdflib==6.1.1
# confection
# spacy
# thinc
# weasel
pydantic-core==2.16.3
# via pydantic
pyparsing==3.1.2
# via rdflib
rdflib==7.0.0
# via folia
requests==2.27.1
requests==2.31.0
# via
# folia
# spacy
# weasel
six==1.16.0
# via isodate
smart-open==5.2.1
# via pathy
soupsieve==2.3.1
smart-open==6.4.0
# via
# spacy
# weasel
soupsieve==2.5
# via beautifulsoup4
spacy==3.2.1
spacy==3.7.4
# via corpus2alpino (setup.py)
spacy-legacy==3.0.8
spacy-legacy==3.0.12
# via spacy
spacy-loggers==1.0.1
spacy-loggers==1.0.5
# via spacy
srsly==2.4.2
srsly==2.4.8
# via
# confection
# spacy
# thinc
# weasel
tei-reader==0.0.17
# via corpus2alpino (setup.py)
thinc==8.0.13
thinc==8.2.3
# via spacy
tqdm==4.62.3
tqdm==4.66.2
# via
# corpus2alpino (setup.py)
# spacy
typer==0.4.0
typer==0.9.0
# via
# pathy
# spacy
typing-extensions==3.10.0.2
# weasel
typing-extensions==4.10.0
# via
# catalogue
# importlib-metadata
# cloudpathlib
# pydantic
# spacy
# thinc
urllib3==1.26.8
# pydantic-core
# typer
urllib3==2.2.1
# via requests
wasabi==0.9.0
wasabi==1.1.2
# via
# spacy
# spacy-loggers
# thinc
zipp==3.8.0
# via
# catalogue
# importlib-metadata
# weasel
weasel==0.3.4
# via spacy

# The following packages are considered to be unsafe in a requirements file:
# setuptools

0 comments on commit 8d9db05

Please sign in to comment.