From 517b74143612854e964821effc564f818bc9dc01 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 1 Jul 2024 15:08:03 +0530 Subject: [PATCH 01/34] modify/set file path type as Path --- src/openpecha/alignment/parsers/plaintext.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index a97d60a..610f0dd 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,15 +1,18 @@ +from pathlib import Path + + class PlainText: def __init__(self, source_text: str, target_text: str): self.source_text = source_text - self.traget_text = target_text + self.target_text = target_text @classmethod - def from_files(cls, source_path: str, target_path: str): - source_text = open(source_path).read() - target_text = open(target_path).read() + def from_files(cls, source_path: Path, target_path: Path): + source_text = source_path.read_text(encoding="utf-8") + target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text) - def parse(self, metadata: dict): + def parse(self, metadata: dict = None): # source_segments = self.source_text.split("\n") # target_segments = self.target_text.split("\n") From 3836ad4ed8869c2302f8d351a7853e100d11c6ad Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 1 Jul 2024 15:16:46 +0530 Subject: [PATCH 02/34] set project_name as "openpecha" --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a4cf13f..8a3e3c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,12 +3,12 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "project_name" +name = "openpecha" version = "0.0.1" authors = [ { name="OpenPecha", email="dev@openpecha.org" }, ] -description = "A small example package" +description = "OpenPecha toolkit version 2" readme = "README.md" requires-python = ">=3.8" classifiers = [ From 54df02a3e1b34aab570717e0d919d156097216f1 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 1 Jul 2024 15:58:08 +0530 Subject: [PATCH 03/34] create/class Pecha --- src/openpecha/alignment/parsers/plaintext.py | 20 ++++++++++++++++--- src/openpecha/alignment/pecha/pecha.py | 2 -- .../{alignment => }/pecha/__init__.py | 0 src/openpecha/pecha/pecha.py | 16 +++++++++++++++ 4 files changed, 33 insertions(+), 5 deletions(-) delete mode 100644 src/openpecha/alignment/pecha/pecha.py rename src/openpecha/{alignment => }/pecha/__init__.py (100%) create mode 100644 src/openpecha/pecha/pecha.py diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 610f0dd..83173ad 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,5 +1,8 @@ from pathlib import Path +from openpecha.ids import get_initial_pecha_id, get_uuid +from openpecha.pecha.pecha import Pecha + class PlainText: def __init__(self, source_text: str, target_text: str): @@ -13,11 +16,22 @@ def from_files(cls, source_path: Path, target_path: Path): return cls(source_text, target_text) def parse(self, metadata: dict = None): - # source_segments = self.source_text.split("\n") - # target_segments = self.target_text.split("\n") + source_text_lines = self.source_text.split("\n") + target_text_lines = self.target_text.split("\n") + + """ prepare the data for pecha creation""" + source_pecha_id, target_pecha_id = ( + get_initial_pecha_id(), + get_initial_pecha_id(), + ) + source_segments = {get_uuid(): segment for segment in source_text_lines} + target_segments = {get_uuid(): segment for segment in target_text_lines} + + source_pecha = Pecha(source_pecha_id, source_segments) # noqa + target_pecha = Pecha(target_pecha_id, target_segments) # noqa # TODO: - # 1. Create pecha with segment layers for source and target text + # 2. create a segment pairs [((source_pecha_id,source_segment_id), (target_pecha_id, target_segment_id)), ...] # 3. Create AlignmentMetadata diff --git a/src/openpecha/alignment/pecha/pecha.py b/src/openpecha/alignment/pecha/pecha.py deleted file mode 100644 index cbffb86..0000000 --- a/src/openpecha/alignment/pecha/pecha.py +++ /dev/null @@ -1,2 +0,0 @@ -class Pecha: - pass diff --git a/src/openpecha/alignment/pecha/__init__.py b/src/openpecha/pecha/__init__.py similarity index 100% rename from src/openpecha/alignment/pecha/__init__.py rename to src/openpecha/pecha/__init__.py diff --git a/src/openpecha/pecha/pecha.py b/src/openpecha/pecha/pecha.py new file mode 100644 index 0000000..9d69554 --- /dev/null +++ b/src/openpecha/pecha/pecha.py @@ -0,0 +1,16 @@ +from typing import Dict + + +class Pecha: + def __init__(self, pecha_id: str, segments: Dict[str, str]) -> None: + self.pecha_id = pecha_id + self.segments = segments + self.base_text = "".join(segments.values()) + + @classmethod + def from_path(cls, path: str): + pass + + @classmethod + def from_id(cls, pecha_id: str): + pass From 1adcc16c5e8a8778bcf6ab5bde2de4afe0e2e7c2 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 1 Jul 2024 16:05:15 +0530 Subject: [PATCH 04/34] modify/move alignment pecha codes to its __init__.py --- src/openpecha/alignment/__init__.py | 36 ++++++++++++++++++++ src/openpecha/alignment/alignment.py | 36 -------------------- src/openpecha/alignment/parsers/plaintext.py | 2 +- src/openpecha/pecha/__init__.py | 16 +++++++++ src/openpecha/pecha/pecha.py | 16 --------- 5 files changed, 53 insertions(+), 53 deletions(-) delete mode 100644 src/openpecha/alignment/alignment.py delete mode 100644 src/openpecha/pecha/pecha.py diff --git a/src/openpecha/alignment/__init__.py b/src/openpecha/alignment/__init__.py index e69de29..5f769d7 100644 --- a/src/openpecha/alignment/__init__.py +++ b/src/openpecha/alignment/__init__.py @@ -0,0 +1,36 @@ +from typing import List, Tuple + + +class AlignmentMetadata: + pass + + +class Alignment: + def __init__( + self, + metadata: AlignmentMetadata, + parser_segment_pairs=None, + alignment_segment_pairs=None, + ): + self.metadata = metadata + self.parser_segment_pairs = parser_segment_pairs + self.alignment_segment_pairs = alignment_segment_pairs + + @classmethod + def from_path(cls, path: str): + pass + + @classmethod + def from_id(cls, alignment_id: str): + pass + + @classmethod + def from_segment_pairs( + cls, + segment_pairs: List[Tuple[Tuple[str, str], Tuple[str, str]]], + metadata: AlignmentMetadata, + ): + return cls(metadata=metadata, parser_segment_pairs=segment_pairs) + + def save(self, path: str): + pass diff --git a/src/openpecha/alignment/alignment.py b/src/openpecha/alignment/alignment.py deleted file mode 100644 index 5f769d7..0000000 --- a/src/openpecha/alignment/alignment.py +++ /dev/null @@ -1,36 +0,0 @@ -from typing import List, Tuple - - -class AlignmentMetadata: - pass - - -class Alignment: - def __init__( - self, - metadata: AlignmentMetadata, - parser_segment_pairs=None, - alignment_segment_pairs=None, - ): - self.metadata = metadata - self.parser_segment_pairs = parser_segment_pairs - self.alignment_segment_pairs = alignment_segment_pairs - - @classmethod - def from_path(cls, path: str): - pass - - @classmethod - def from_id(cls, alignment_id: str): - pass - - @classmethod - def from_segment_pairs( - cls, - segment_pairs: List[Tuple[Tuple[str, str], Tuple[str, str]]], - metadata: AlignmentMetadata, - ): - return cls(metadata=metadata, parser_segment_pairs=segment_pairs) - - def save(self, path: str): - pass diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 83173ad..ada0e63 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,7 +1,7 @@ from pathlib import Path from openpecha.ids import get_initial_pecha_id, get_uuid -from openpecha.pecha.pecha import Pecha +from openpecha.pecha import Pecha class PlainText: diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index e69de29..9d69554 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -0,0 +1,16 @@ +from typing import Dict + + +class Pecha: + def __init__(self, pecha_id: str, segments: Dict[str, str]) -> None: + self.pecha_id = pecha_id + self.segments = segments + self.base_text = "".join(segments.values()) + + @classmethod + def from_path(cls, path: str): + pass + + @classmethod + def from_id(cls, pecha_id: str): + pass diff --git a/src/openpecha/pecha/pecha.py b/src/openpecha/pecha/pecha.py deleted file mode 100644 index 9d69554..0000000 --- a/src/openpecha/pecha/pecha.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Dict - - -class Pecha: - def __init__(self, pecha_id: str, segments: Dict[str, str]) -> None: - self.pecha_id = pecha_id - self.segments = segments - self.base_text = "".join(segments.values()) - - @classmethod - def from_path(cls, path: str): - pass - - @classmethod - def from_id(cls, pecha_id: str): - pass From fce29b2ba93df059c1bd9f473949695c0b6882c2 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 1 Jul 2024 16:34:58 +0530 Subject: [PATCH 05/34] create/class Annotation --- pyproject.toml | 5 +++++ src/openpecha/pecha/__init__.py | 15 +++++++++++++++ src/openpecha/pecha/annotation.py | 16 ++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 src/openpecha/pecha/annotation.py diff --git a/pyproject.toml b/pyproject.toml index 8a3e3c5..91d18ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,11 @@ classifiers = [ "Operating System :: OS Independent", ] +dependencies = [ + "pydantic >= 2.7.4", + +] + [project.optional-dependencies] dev = [ "pytest", diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 9d69554..00c6480 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,11 +1,14 @@ from typing import Dict +from openpecha.pecha.annotation import Annotation + class Pecha: def __init__(self, pecha_id: str, segments: Dict[str, str]) -> None: self.pecha_id = pecha_id self.segments = segments self.base_text = "".join(segments.values()) + self.annotations = self.build_annotations() @classmethod def from_path(cls, path: str): @@ -14,3 +17,15 @@ def from_path(cls, path: str): @classmethod def from_id(cls, pecha_id: str): pass + + def build_annotations(self): + char_count = 0 + for segment_id, segment in self.segments.items(): + annotation = Annotation( + annotation_id=segment_id, + segment=segment, + start=char_count, + end=char_count + len(segment), + ) + char_count += len(segment) + yield annotation diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py new file mode 100644 index 0000000..138adf7 --- /dev/null +++ b/src/openpecha/pecha/annotation.py @@ -0,0 +1,16 @@ +from pydantic import BaseModel, Field, ValidationInfo, field_validator + + +class Annotation(BaseModel): + annotation_id: str + segment: str + start: int = Field(ge=0) + end: int = Field(ge=0) + metadata: dict = Field(default_factory=dict) + + @field_validator("end") + @classmethod + def end_must_not_be_less_than_start(cls, v: int, values: ValidationInfo) -> int: + if "start" in values.data and v < values.data["start"]: + raise ValueError("Span end must not be less than start") + return v From ac2922727e34c842c2dedd9e8485a73a5f96a891 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 09:23:17 +0530 Subject: [PATCH 06/34] rename function/to set_annotations --- src/openpecha/pecha/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 00c6480..0781347 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -8,7 +8,7 @@ def __init__(self, pecha_id: str, segments: Dict[str, str]) -> None: self.pecha_id = pecha_id self.segments = segments self.base_text = "".join(segments.values()) - self.annotations = self.build_annotations() + self.annotations = self.set_annotations() @classmethod def from_path(cls, path: str): @@ -18,7 +18,7 @@ def from_path(cls, path: str): def from_id(cls, pecha_id: str): pass - def build_annotations(self): + def set_annotations(self): char_count = 0 for segment_id, segment in self.segments.items(): annotation = Annotation( From 816934853c910ae9feef17b518d9124527d393f8 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 10:31:22 +0530 Subject: [PATCH 07/34] Pecha function write annotations --- pyproject.toml | 1 + src/openpecha/alignment/parsers/plaintext.py | 17 +++++---- src/openpecha/pecha/__init__.py | 38 ++++++++++++++++++-- src/openpecha/pecha/annotation.py | 2 +- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 91d18ef..8f147a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ classifiers = [ dependencies = [ "pydantic >= 2.7.4", + "stam == 0.8.2", ] diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index ada0e63..5243da2 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -5,17 +5,18 @@ class PlainText: - def __init__(self, source_text: str, target_text: str): + def __init__(self, source_text: str, target_text: str, metadata: dict): self.source_text = source_text self.target_text = target_text + self.metadata = metadata @classmethod - def from_files(cls, source_path: Path, target_path: Path): + def from_files(cls, source_path: Path, target_path: Path, metadata: dict): source_text = source_path.read_text(encoding="utf-8") target_text = target_path.read_text(encoding="utf-8") - return cls(source_text, target_text) + return cls(source_text, target_text, metadata) - def parse(self, metadata: dict = None): + def parse(self): source_text_lines = self.source_text.split("\n") target_text_lines = self.target_text.split("\n") @@ -27,8 +28,12 @@ def parse(self, metadata: dict = None): source_segments = {get_uuid(): segment for segment in source_text_lines} target_segments = {get_uuid(): segment for segment in target_text_lines} - source_pecha = Pecha(source_pecha_id, source_segments) # noqa - target_pecha = Pecha(target_pecha_id, target_segments) # noqa + source_pecha = Pecha( # noqa + source_pecha_id, source_segments, self.metadata["source"] + ) + target_pecha = Pecha( # noqa + target_pecha_id, target_segments, self.metadata["target"] + ) # TODO: diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 0781347..bfd82b5 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,14 +1,21 @@ from typing import Dict +from stam import AnnotationStore, Offset, Selector + +from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation class Pecha: - def __init__(self, pecha_id: str, segments: Dict[str, str]) -> None: + def __init__( + self, pecha_id: str, segments: Dict[str, str], metadata: Dict[str, str] + ) -> None: self.pecha_id = pecha_id self.segments = segments + self.metadata = metadata self.base_text = "".join(segments.values()) self.annotations = self.set_annotations() + self.write_annotations() @classmethod def from_path(cls, path: str): @@ -22,10 +29,37 @@ def set_annotations(self): char_count = 0 for segment_id, segment in self.segments.items(): annotation = Annotation( - annotation_id=segment_id, + id_=segment_id, segment=segment, start=char_count, end=char_count + len(segment), ) char_count += len(segment) yield annotation + + def write_annotations(self): + """write annotations in stam data model""" + self.annotation_store = AnnotationStore(id="PechaAnnotationStore") + self.resource = self.annotation_store.add_resource( + id=self.pecha_id, filename="random file path" + ) # in case of having layers, resource_id will be pecha_id_layer_id + self.dataset = self.annotation_store.add_dataset(id="PechaDataSet") + self.dataset.add_key(self.metadata["annotation_category"]) + for annotation in self.annotations: + target = Selector.textselector( + self.resource, + Offset.simple(annotation.start, annotation.end), + ) + data = [ + { + "id": annotation.id_, + "key": self.metadata["annotation_category"], + "value": self.metadata["annotation_label"], + "set": self.dataset.id(), + } + ] + self.annotation_store.add_annotation( + id=annotation.id_, + target=target, + data=data, + ) diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py index 138adf7..99c2132 100644 --- a/src/openpecha/pecha/annotation.py +++ b/src/openpecha/pecha/annotation.py @@ -2,7 +2,7 @@ class Annotation(BaseModel): - annotation_id: str + id_: str segment: str start: int = Field(ge=0) end: int = Field(ge=0) From 5ea7922695752b76cad2760512e84c30e7665410 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 11:16:30 +0530 Subject: [PATCH 08/34] Pecha function create pecha folder --- src/openpecha/config.py | 12 ++++++++++++ src/openpecha/pecha/__init__.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 src/openpecha/config.py diff --git a/src/openpecha/config.py b/src/openpecha/config.py new file mode 100644 index 0000000..a55dc16 --- /dev/null +++ b/src/openpecha/config.py @@ -0,0 +1,12 @@ +from pathlib import Path + + +def _mkdir(path): + if path.is_dir(): + return path + path.mkdir(exist_ok=True, parents=True) + return path + + +BASE_PATH = _mkdir(Path.home() / ".pechadata") +PECHAS_PATH = _mkdir(BASE_PATH / "pechas") diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index bfd82b5..48c60ae 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -2,7 +2,7 @@ from stam import AnnotationStore, Offset, Selector -from openpecha.ids import get_uuid +from openpecha.config import PECHAS_PATH from openpecha.pecha.annotation import Annotation @@ -37,12 +37,30 @@ def set_annotations(self): char_count += len(segment) yield annotation + def create_pecha_folder(self): + pecha_dir = PECHAS_PATH.joinpath(self.pecha_id) + opf_dir = pecha_dir.joinpath(f"{self.pecha_id}.opf") + base_dir = opf_dir.joinpath("base") + layers_dir = opf_dir.joinpath("layers") + layer_id_dir = layers_dir.joinpath(self.pecha_id) + + pecha_dir.mkdir(exist_ok=True) + opf_dir.mkdir(exist_ok=True) + base_dir.mkdir(exist_ok=True) + base_dir.joinpath(f"{self.pecha_id}.txt").write_text(self.base_text) + layers_dir.mkdir(exist_ok=True) + layer_id_dir.mkdir(exist_ok=True) + + self.annotation_fn = layer_id_dir + self.base_fn = base_dir.joinpath(f"{self.pecha_id}.txt") + def write_annotations(self): + self.create_pecha_folder() """write annotations in stam data model""" self.annotation_store = AnnotationStore(id="PechaAnnotationStore") self.resource = self.annotation_store.add_resource( - id=self.pecha_id, filename="random file path" - ) # in case of having layers, resource_id will be pecha_id_layer_id + id=self.pecha_id, filename=self.base_fn.as_posix() + ) self.dataset = self.annotation_store.add_dataset(id="PechaDataSet") self.dataset.add_key(self.metadata["annotation_category"]) for annotation in self.annotations: @@ -58,8 +76,15 @@ def write_annotations(self): "set": self.dataset.id(), } ] - self.annotation_store.add_annotation( + self.annotation_store.annotate( id=annotation.id_, target=target, data=data, ) + """ save annotations in stam data model""" + self.annotation_store.set_filename( + self.annotation_fn.joinpath( + f"{self.metadata['annotation_label']}.json" + ).as_posix() + ) + self.annotation_store.save() From cac48fcd24a921bff4c24dee5332517a79a53912 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 11:38:11 +0530 Subject: [PATCH 09/34] fix/set annotation data id as unique --- src/openpecha/pecha/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 48c60ae..e3454b6 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -3,6 +3,7 @@ from stam import AnnotationStore, Offset, Selector from openpecha.config import PECHAS_PATH +from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation @@ -63,6 +64,8 @@ def write_annotations(self): ) self.dataset = self.annotation_store.add_dataset(id="PechaDataSet") self.dataset.add_key(self.metadata["annotation_category"]) + + unique_annotation_data_id = get_uuid() for annotation in self.annotations: target = Selector.textselector( self.resource, @@ -70,7 +73,7 @@ def write_annotations(self): ) data = [ { - "id": annotation.id_, + "id": unique_annotation_data_id, "key": self.metadata["annotation_category"], "value": self.metadata["annotation_label"], "set": self.dataset.id(), From 0cd444ad64283dfa46387b8065ba6f229796c168 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 11:43:22 +0530 Subject: [PATCH 10/34] write metadata as json --- src/openpecha/pecha/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index e3454b6..062eb1d 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,3 +1,4 @@ +import json from typing import Dict from stam import AnnotationStore, Offset, Selector @@ -41,12 +42,17 @@ def set_annotations(self): def create_pecha_folder(self): pecha_dir = PECHAS_PATH.joinpath(self.pecha_id) opf_dir = pecha_dir.joinpath(f"{self.pecha_id}.opf") + metadata_dir = opf_dir.joinpath("metadata.json") base_dir = opf_dir.joinpath("base") layers_dir = opf_dir.joinpath("layers") layer_id_dir = layers_dir.joinpath(self.pecha_id) pecha_dir.mkdir(exist_ok=True) opf_dir.mkdir(exist_ok=True) + metadata_dir.write_text( + json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" + ) + base_dir.mkdir(exist_ok=True) base_dir.joinpath(f"{self.pecha_id}.txt").write_text(self.base_text) layers_dir.mkdir(exist_ok=True) From 4157fef62bf9b497397f6f425627457416eaa23a Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 11:58:02 +0530 Subject: [PATCH 11/34] include base path to func write annotations --- src/openpecha/alignment/parsers/plaintext.py | 2 ++ src/openpecha/pecha/__init__.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 5243da2..ff214b6 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -34,6 +34,8 @@ def parse(self): target_pecha = Pecha( # noqa target_pecha_id, target_segments, self.metadata["target"] ) + source_pecha.write_annotations() + target_pecha.write_annotations() # TODO: diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 062eb1d..5b22a85 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,4 +1,5 @@ import json +from pathlib import Path from typing import Dict from stam import AnnotationStore, Offset, Selector @@ -15,9 +16,6 @@ def __init__( self.pecha_id = pecha_id self.segments = segments self.metadata = metadata - self.base_text = "".join(segments.values()) - self.annotations = self.set_annotations() - self.write_annotations() @classmethod def from_path(cls, path: str): @@ -28,6 +26,7 @@ def from_id(cls, pecha_id: str): pass def set_annotations(self): + """set annotations for the segments""" char_count = 0 for segment_id, segment in self.segments.items(): annotation = Annotation( @@ -39,8 +38,8 @@ def set_annotations(self): char_count += len(segment) yield annotation - def create_pecha_folder(self): - pecha_dir = PECHAS_PATH.joinpath(self.pecha_id) + def create_pecha_folder(self, base_path: Path): + pecha_dir = base_path.joinpath(self.pecha_id) opf_dir = pecha_dir.joinpath(f"{self.pecha_id}.opf") metadata_dir = opf_dir.joinpath("metadata.json") base_dir = opf_dir.joinpath("base") @@ -61,8 +60,11 @@ def create_pecha_folder(self): self.annotation_fn = layer_id_dir self.base_fn = base_dir.joinpath(f"{self.pecha_id}.txt") - def write_annotations(self): - self.create_pecha_folder() + def write_annotations(self, base_path: Path = PECHAS_PATH): + self.base_text = "".join(self.segments.values()) + self.annotations = self.set_annotations() + + self.create_pecha_folder(base_path) """write annotations in stam data model""" self.annotation_store = AnnotationStore(id="PechaAnnotationStore") self.resource = self.annotation_store.add_resource( From 6420cd2cf89ecbdf88871047ed02f3352f75c54d Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 12:09:17 +0530 Subject: [PATCH 12/34] include base path to PlainText parse --- src/openpecha/alignment/parsers/plaintext.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index ff214b6..639839e 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,5 +1,6 @@ from pathlib import Path +from openpecha.config import PECHAS_PATH from openpecha.ids import get_initial_pecha_id, get_uuid from openpecha.pecha import Pecha @@ -16,7 +17,7 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text, metadata) - def parse(self): + def parse(self, base_path: Path = PECHAS_PATH): source_text_lines = self.source_text.split("\n") target_text_lines = self.target_text.split("\n") @@ -34,8 +35,8 @@ def parse(self): target_pecha = Pecha( # noqa target_pecha_id, target_segments, self.metadata["target"] ) - source_pecha.write_annotations() - target_pecha.write_annotations() + source_pecha.write_annotations(base_path) + target_pecha.write_annotations(base_path) # TODO: From b6b4c7a675679b316f26429544a1bfbaf79036d4 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 12:20:06 +0530 Subject: [PATCH 13/34] refactor code --- src/openpecha/alignment/parsers/plaintext.py | 22 ++++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 639839e..a632e47 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -17,23 +17,27 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text, metadata) - def parse(self, base_path: Path = PECHAS_PATH): + def parse(self): source_text_lines = self.source_text.split("\n") target_text_lines = self.target_text.split("\n") - """ prepare the data for pecha creation""" + self.source_segments = {get_uuid(): segment for segment in source_text_lines} + self.target_segments = {get_uuid(): segment for segment in target_text_lines} + + def save(self, base_path: Path = PECHAS_PATH): + if not self.source_segments or not self.target_segments: + self.parse() + + """ save the source and target pecha""" source_pecha_id, target_pecha_id = ( get_initial_pecha_id(), get_initial_pecha_id(), ) - source_segments = {get_uuid(): segment for segment in source_text_lines} - target_segments = {get_uuid(): segment for segment in target_text_lines} - - source_pecha = Pecha( # noqa - source_pecha_id, source_segments, self.metadata["source"] + source_pecha = Pecha( + source_pecha_id, self.source_segments, self.metadata["source"] ) - target_pecha = Pecha( # noqa - target_pecha_id, target_segments, self.metadata["target"] + target_pecha = Pecha( + target_pecha_id, self.target_segments, self.metadata["target"] ) source_pecha.write_annotations(base_path) target_pecha.write_annotations(base_path) From 053717c0ef2628dfe082648b6e408cfe2455807d Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 12:46:54 +0530 Subject: [PATCH 14/34] fix/check if attribute exits --- src/openpecha/alignment/parsers/plaintext.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index a632e47..2b62fac 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -25,7 +25,7 @@ def parse(self): self.target_segments = {get_uuid(): segment for segment in target_text_lines} def save(self, base_path: Path = PECHAS_PATH): - if not self.source_segments or not self.target_segments: + if not hasattr(self, "source_segments") or not hasattr(self, "target_segments"): self.parse() """ save the source and target pecha""" @@ -39,8 +39,7 @@ def save(self, base_path: Path = PECHAS_PATH): target_pecha = Pecha( target_pecha_id, self.target_segments, self.metadata["target"] ) - source_pecha.write_annotations(base_path) - target_pecha.write_annotations(base_path) + return source_pecha, target_pecha # TODO: From c3251b21994efdaa6f41d48d15a355b2a44a423c Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 12:53:44 +0530 Subject: [PATCH 15/34] test for plaintext --- .pre-commit-config.yaml | 1 - src/openpecha/config.py | 2 - tests/{__init__.py => __init__.py} | 0 .../parsers/plaintext/data/comments.txt | 5 ++ .../parsers/plaintext/data/segments.txt | 5 ++ .../parsers/plaintext/test_plaintext.py | 56 +++++++++++++++++++ 6 files changed, 66 insertions(+), 3 deletions(-) rename tests/{__init__.py => __init__.py} (100%) create mode 100755 tests/alignment/parsers/plaintext/data/comments.txt create mode 100755 tests/alignment/parsers/plaintext/data/segments.txt create mode 100644 tests/alignment/parsers/plaintext/test_plaintext.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 55e4b03..a91a928 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,6 @@ repos: rev: v4.3.0 hooks: - id: trailing-whitespace - - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/asottile/pyupgrade diff --git a/src/openpecha/config.py b/src/openpecha/config.py index a55dc16..3123568 100644 --- a/src/openpecha/config.py +++ b/src/openpecha/config.py @@ -2,8 +2,6 @@ def _mkdir(path): - if path.is_dir(): - return path path.mkdir(exist_ok=True, parents=True) return path diff --git a/tests/__init__.py b/tests/__init__.py similarity index 100% rename from tests/__init__.py rename to tests/__init__.py diff --git a/tests/alignment/parsers/plaintext/data/comments.txt b/tests/alignment/parsers/plaintext/data/comments.txt new file mode 100755 index 0000000..bac8759 --- /dev/null +++ b/tests/alignment/parsers/plaintext/data/comments.txt @@ -0,0 +1,5 @@ +{D3874}༄༅༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙཱརྱ་ཨ་བ་ཏཱ་ར་སང་ཀཱ་ར། +བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ། +བཅོམ་ལྡན་འདས་གསུང་གི་མངའ་བདག་འཇམ་དཔལ་གཞོན་ནུར་གྱུར་པ་ལ་ཕྱག་འཚལ་ལོ། །ངོ་བོ་ཉིད་ནི་བྱང་ཆུབ་སེམས་པའི། །རྒྱ་མཚོ་དེ་ལ་ཕྱག་འཚལ་ཏེ། །བདག་འདྲའི་སྤྱོད་པའི་ཡན་ལག་ལ། །འཇུག་ཕྱིར་ལེགས་སྦྱར་བཤད་ཙམ་བྱ། །དམ་པ་རྣམས་ཀྱིས་ནི་ཐོག་མ་དང་བར་དང་ཐ་མར་དགེ་བ་མངོན་པར་འཕེལ་བར་བྱ་བ་ཡིན་པས། བདེ་གཤེགས་ཞེས་བྱ་བ་ལ་སོགས་པ་སྨོས་པ་ཡིན་ཏེ། འདིར་ཕྱག་འཚལ་བ་ནི་དང་པོར་དགེ་བའོ། །ཆོས་བསྟན་པ་ནི་བར་དུ་དགེ་བའོ། །དགེ་བའི་རྩ་བ་ཡོངས་སུ་བསྔོ་བ་ནི་དགེ་བའི་རྩ་བ་མངོན་པར་འཕེལ་བ་ཡིན་པས་ཐ་མར་དགེ་བ་ཡིན་ནོ། །དེ་ལ་བདེ་བར་གཤེགས་པ་ནི་རྟོགས་པར་བྱ་བའི་ལྷག་མ་མི་མངའ་བས་ན་ཡོངས་སུ་རྫོགས་པར་ཐུགས་སུ་ཆུད་པའི་ཕྱིར་བདེ་བར་གཤེགས་པའོ། །ཆོས་ཀྱི་སྐུ་མངའ་བ་ནི་ལུང་དང་རྟོགས་པའི་བདག་ཉིད་ཅན་གྱི་དམ་པའི་ཆོས་ཀྱི་ཚོགས་ནི་ཆོས་ཀྱི་སྐུ་སྟེ་དེ་དང་བཅས་པའོ། །སྲས་བཅས་ནི་ཉིད་ལས་འཁྲུངས་པའི་སྲས་ཏེ། བྱང་ཆུབ་སེམས་དཔའ་དང་བཅས་པའོ། །ལ་ལ་ལས་ནི་བདེ་གཤེགས་དམ་པའི་ཆོས་དང་དགེ་འདུན་བཅས་ཞེས་ཟེར་རོ། །བཙུན་པ་ནི་ཉན་ཐོས་ཆེན་པོ་བརྒྱད་ལ་སོགས་པ་ལ་བྱ་སྟེ། དེ་དག་མ་ལུས་པ་ཀུན་ལ་ཕྱག་འཚལ་བའོ། །དཀོན་མཆོག་གསུམ་པོ་གཙོ་བོར་གྱུར་པས་སོ་སོར་སྨོས་པ་ཡིན་ལ། དེ་དག་ཀྱང་ཕྱག་བྱ་བར་འོས་པ་ཡིན་པས་གུས་པས་ཕྱག་འཚལ་ཏེ། ཞེས་བྱ་བ་སྨོས་ཏེ། འདིར་ཡོན་ཏན་དམ་པའི་བསྟོད་པ་རྒྱ་ཆེ་བ་དང་། མཆོད་པ་ཁྱད་པར་དུ་འཕགས་པའི་དམིགས་པ་ཡིད་ལ་བྱེད་པ་ལས་བྱུང་བའི་མོས་པའི་བསམ་པ་ཤིན་ཏུ་ཕུལ་དུ་བྱུང་བའི་དགའ་བ་རྒྱ་ཆེ་བའི་མཆོད་པ་དང་བཅས་པས་ལུས་ཞིང་ཐམས་ཅད་ཀྱི་རྡུལ་སྙེད་ཀྱིས་བཏུད་ཅིང་ཕྱག་འཚལ་ལོ། །དེ་ལྟར་ཕྱག་བཙལ་ནས་ཅི་ཞིག་བྱེད་ཅེ་ན། བདེ་གཤེགས་སྲས་ཀྱི་ཞེས་བྱ་བ་ལ་སོགས་པ་སྨོས་ཏེ། བདེ་བར་གཤེགས་པའི་བདག་ཉིད་ནི་ཆོས་ཀྱི་སྐུ་སྟེ། དེའི་དབང་དུ་བྱས་པ་ལས་སྐྱེས་པ་ས་ཆེན་པོ་ཐོབ་པ་དང་། རྒྱུ་ལ་གནས་པ་རྣམས་སོ། །དེ་རྣམས་ཀྱི་སྡོམ་པ་ནི་མི་དགེ་བ་སྤོང་བ་དང་། དགེ་བ་ལ་འཇུག་པ་དང་། སེམས་ཅན་གྱི་དོན་བྱ་བའོ། །དེ་ཡང་བཅོམ་ལྡན་འདས་ཀྱིས་ཤིན་ཏུ་ཟབ་ཅིང་རྒྱ་ཆེ་བའི་བདག་ཉིད་ཅན་དུ་གསུངས་ལ། དེར་བྱང་ཆུབ་ཏུ་སེམས་བསྐྱེད་པའི་ཕན་ཡོན་ལ་སོགས་པའི་དོན་རྣམ་པ་བཅུ་པོ་གང་ཡིན་པ་དེས་འཇུག་པའི་བདེ་བར་གཤེགས་པའི་སྲས་ཀྱི་སྡོམ་པ་ལ་འཇུག་པ་བསྟན་པར་བྱའོ། །དེ་ཡང་ལུང་བཞིན་ཞེས་བྱ་བ་སྟེ། ལུང་གི་དོན་དང་མི་འགལ་བར་རོ། །ལུང་ལས་ནི་བཅོམ་ལྡན་འདས་ཀྱིས་རྒྱ་ཆེར་གསུངས་སོ་ཞེ་ན། མདོར་བསྡུས་ནས་ནི་ཞེས་བྱ་བ་སྨོས་སོ། །དེ་ལྟ་ཡིན་དུ་ཆུག་ན། ཅི་འདིར་སྔོན་ཆད་མ་བྱུང་བ་གཞན་འགའ་ཞིག་སྨས་སམ། + +ལུང་ཇི་ལྟ་བ་ཡིན་ཞེ་ན། སྔོན་ཆད་ཅེས་བྱ་བ་ལ་སོགས་པ་སྨོས་སོ། །སྡེབ་སྦྱོར་མཁས་པས་སྔོན་མ་ཡིན་ནམ་ཞེ་ན། སྡེབ་སྦྱོར་ཞེས་བྱ་བ་ལ་སོགས་པ་སྨོས་སོ། །གང་གི་ཕྱིར་འདིར་སྡེབ་སྦྱོར་ལ་མཁས་པ་མེད་པ་ཉིད་ཀྱི་ཕྱིར་གཞན་གྱི་དོན་དུ་བདག་གིས་འདི་གཞུང་དུ་ཉེ་བར་སྦྱར་བ་མ་བྱས་སོ་ཞེས་བྱ་བར་དགོངས་སོ། །དེ་ལྟར་གལ་ཏེ་གཞན་གྱི་དོན་དུ་མ་བྱས་ན་ཅིའི་ཕྱིར་བྱེད་ཅེ་ན། དེའི་ཕྱིར་རང་གི་ཞེས་བྱ་བ་ལ་སོགས་པ་སྨོས་ཏེ། ཡིད་ལ་འདིར་བྱང་ཆུབ་ཀྱི་སེམས་ཏེ། དེ་བསྒོམ་པའི་ཕྱིར་ཞེས་བྱ་བ་ནི་བསླབ་པའི་ཕྱིར་རོ། །བཅོམ་ལྡན་འདས་ཀྱིས་ལུང་ལས་ཚིག་གི་དོན་རྒྱ་ཆེར་གསུངས་པ་དེ་ལས་མདོར་བསྡུས་ཏེ་རང་གི་ཡིད་ལ་བསྒོམ་པར་བྱ་བའི་ཕྱིར་བདག་གིས་འདི་བྱས་སོ་ཞེས་པའོ། ། \ No newline at end of file diff --git a/tests/alignment/parsers/plaintext/data/segments.txt b/tests/alignment/parsers/plaintext/data/segments.txt new file mode 100755 index 0000000..1c756a1 --- /dev/null +++ b/tests/alignment/parsers/plaintext/data/segments.txt @@ -0,0 +1,5 @@ +རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར། +བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། +སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། ། +བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། ། +སྔོན་ཆད་མ་བྱུང་བ་ཡང་འདིར་བརྗོད་མེད། །སྡེབ་སྦྱོར་མཁས་པའང་བདག་ལ་ཡོད་མིན་ཏེ། །དེ་ཕྱིར་གཞན་དོན་བསམ་པ་བདག་ལ་མེད། །རང་གི་ཡིད་ལ་བསྒོམ་ཕྱིར་ངས་འདི་བརྩམས། ། \ No newline at end of file diff --git a/tests/alignment/parsers/plaintext/test_plaintext.py b/tests/alignment/parsers/plaintext/test_plaintext.py new file mode 100644 index 0000000..ca44bce --- /dev/null +++ b/tests/alignment/parsers/plaintext/test_plaintext.py @@ -0,0 +1,56 @@ +from pathlib import Path + +from openpecha.alignment.parsers.plaintext import PlainText +from openpecha.pecha import Pecha + + +def test_plaintext_parse(): + DATA_DIR = Path(__file__).parent / "data" + source_path = DATA_DIR / "segments.txt" + target_path = DATA_DIR / "comments.txt" + + metadata = { + "source": { + "annotation_category": "Structure Type", + "annotation_label": "Segment", + }, + "target": { + "annotation_category": "Structure Type", + "annotation_label": "Comment", + }, + } + plaintext = PlainText.from_files(source_path, target_path, metadata) + plaintext.parse() + + assert ( + len(plaintext.source_segments) == 5 + ), "plaintext parser is not parsing source_segments correctly" + assert ( + len(plaintext.target_segments) == 5 + ), "plaintext parser is not parsing target_segments correctly" + + +def test_plaintext_save(): + DATA_DIR = Path(__file__).parent / "data" + source_path = DATA_DIR / "segments.txt" + target_path = DATA_DIR / "comments.txt" + + metadata = { + "source": { + "annotation_category": "Structure Type", + "annotation_label": "Segment", + }, + "target": { + "annotation_category": "Structure Type", + "annotation_label": "Comment", + }, + } + plaintext = PlainText.from_files(source_path, target_path, metadata) + source_pecha, target_pecha = plaintext.save() + + assert isinstance( + source_pecha, Pecha + ), "plaintext parser is not saving source_pecha as an instance of Pecha" + assert isinstance( + target_pecha, Pecha + ), "plaintext parser is not saving target_pecha as an instance of Pecha" From 6f773afd9945913959cb6e7365c7ea8f53a1df35 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 12:57:52 +0530 Subject: [PATCH 16/34] refactor test plaintext --- .../parsers/plaintext/test_plaintext.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tests/alignment/parsers/plaintext/test_plaintext.py b/tests/alignment/parsers/plaintext/test_plaintext.py index ca44bce..488cd31 100644 --- a/tests/alignment/parsers/plaintext/test_plaintext.py +++ b/tests/alignment/parsers/plaintext/test_plaintext.py @@ -4,12 +4,12 @@ from openpecha.pecha import Pecha -def test_plaintext_parse(): - DATA_DIR = Path(__file__).parent / "data" - source_path = DATA_DIR / "segments.txt" - target_path = DATA_DIR / "comments.txt" +def get_data_dir(): + return Path(__file__).parent / "data" - metadata = { + +def get_metadata(): + return { "source": { "annotation_category": "Structure Type", "annotation_label": "Segment", @@ -19,6 +19,14 @@ def test_plaintext_parse(): "annotation_label": "Comment", }, } + + +def test_plaintext_parse(): + DATA_DIR = get_data_dir() + source_path = DATA_DIR / "segments.txt" + target_path = DATA_DIR / "comments.txt" + + metadata = get_metadata() plaintext = PlainText.from_files(source_path, target_path, metadata) plaintext.parse() @@ -31,26 +39,17 @@ def test_plaintext_parse(): def test_plaintext_save(): - DATA_DIR = Path(__file__).parent / "data" + DATA_DIR = get_data_dir() source_path = DATA_DIR / "segments.txt" target_path = DATA_DIR / "comments.txt" - metadata = { - "source": { - "annotation_category": "Structure Type", - "annotation_label": "Segment", - }, - "target": { - "annotation_category": "Structure Type", - "annotation_label": "Comment", - }, - } + metadata = get_metadata() plaintext = PlainText.from_files(source_path, target_path, metadata) source_pecha, target_pecha = plaintext.save() assert isinstance( source_pecha, Pecha - ), "plaintext parser is not saving source_pecha as an instance of Pecha" + ), f"source_pecha is not an instance of Pecha, but {type(source_pecha)}" assert isinstance( target_pecha, Pecha - ), "plaintext parser is not saving target_pecha as an instance of Pecha" + ), f"target_pecha is not an instance of Pecha, but {type(target_pecha)}" From 216b4fb580e82b5050dc149bd0ffdb57f52cf91a Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 13:21:45 +0530 Subject: [PATCH 17/34] fix/check if annotations exits before setting --- src/openpecha/pecha/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 5b22a85..71f2d3d 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -61,8 +61,10 @@ def create_pecha_folder(self, base_path: Path): self.base_fn = base_dir.joinpath(f"{self.pecha_id}.txt") def write_annotations(self, base_path: Path = PECHAS_PATH): + if not hasattr(self, "annotations"): + self.annotations = self.set_annotations() + self.base_text = "".join(self.segments.values()) - self.annotations = self.set_annotations() self.create_pecha_folder(base_path) """write annotations in stam data model""" From 761ac7884b5308ac4704816e2f327ce17a82dd22 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 13:22:52 +0530 Subject: [PATCH 18/34] test for Pecha --- tests/pecha/test_pecha.py | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tests/pecha/test_pecha.py diff --git a/tests/pecha/test_pecha.py b/tests/pecha/test_pecha.py new file mode 100644 index 0000000..81137f6 --- /dev/null +++ b/tests/pecha/test_pecha.py @@ -0,0 +1,51 @@ +from openpecha.pecha import Pecha +from openpecha.pecha.annotation import Annotation + + +def get_segments(): + return { + "f2b056668a0c4ad3a085bdcd8e2d7adb": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", + "b696df2dbe314e8a87881a2bc391d0d5": "བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", + } + + +def get_metadata(): + return { + "annotation_category": "Structure Type", + "annotation_label": "Segment", + } + + +def get_expected_annotations(): + expected_annotations = [ + Annotation( + id_="f2b056668a0c4ad3a085bdcd8e2d7adb", + segment="རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", + start=0, + end=39, + metadata={}, + ), + Annotation( + id_="b696df2dbe314e8a87881a2bc391d0d5", + segment="བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", + start=39, + end=103, + metadata={}, + ), + ] + return expected_annotations + + +def test_pecha_set_annotations(): + pecha_id = "IE7D6875F" + segments = get_segments() + metadata = get_metadata() + pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) + assert isinstance( + pecha, Pecha + ), "Not able to create Pecha object with id, segments and metadata" + + annotations = list(pecha.set_annotations()) + assert ( + annotations == get_expected_annotations() + ), "Pecha not able to set annotations for the segments" From 047b3ac9e1bf9f2ac0c44fbde8b5cffbb48c6eaf Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 13:26:43 +0530 Subject: [PATCH 19/34] test for ids --- tests/test_ids.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 tests/test_ids.py diff --git a/tests/test_ids.py b/tests/test_ids.py new file mode 100644 index 0000000..39f6913 --- /dev/null +++ b/tests/test_ids.py @@ -0,0 +1,78 @@ +import re + +from openpecha.ids import ( + get_alignment_id, + get_base_id, + get_collection_id, + get_diplomatic_id, + get_id, + get_initial_pecha_id, + get_open_pecha_id, + get_uuid, + get_work_id, +) + + +def test_get_uuid(): + uuid = get_uuid() + assert re.match( + r"^[0-9a-fA-F]{32}$", uuid + ), f"UUID {uuid} is not in the correct format" + + +def test_get_id(): + prefix = "T" + length = 4 + generated_id = get_id(prefix, length) + assert re.match( + r"^T[0-9A-F]{4}$", generated_id + ), f"ID {generated_id} is not in the correct format" + + +def test_get_base_id(): + base_id = get_base_id() + assert re.match( + r"^[0-9A-F]{4}$", base_id + ), f"Base ID {base_id} is not in the correct format" + + +def test_get_initial_pecha_id(): + initial_pecha_id = get_initial_pecha_id() + assert re.match( + r"^I[0-9A-F]{8}$", initial_pecha_id + ), f"Initial Pecha ID {initial_pecha_id} is not in the correct format" + + +def test_get_open_pecha_id(): + open_pecha_id = get_open_pecha_id() + assert re.match( + r"^O[0-9A-F]{8}$", open_pecha_id + ), f"Open Pecha ID {open_pecha_id} is not in the correct format" + + +def test_get_diplomatic_id(): + diplomatic_id = get_diplomatic_id() + assert re.match( + r"^D[0-9A-F]{8}$", diplomatic_id + ), f"Diplomatic ID {diplomatic_id} is not in the correct format" + + +def test_get_work_id(): + work_id = get_work_id() + assert re.match( + r"^W[0-9A-F]{8}$", work_id + ), f"Work ID {work_id} is not in the correct format" + + +def test_get_alignment_id(): + alignment_id = get_alignment_id() + assert re.match( + r"^A[0-9A-F]{8}$", alignment_id + ), f"Alignment ID {alignment_id} is not in the correct format" + + +def test_get_collection_id(): + collection_id = get_collection_id() + assert re.match( + r"^C[0-9A-F]{8}$", collection_id + ), f"Collection ID {collection_id} is not in the correct format" From 4bd398246b01a90a9fa54d032e9bc8dbbb454eca Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 16:28:12 +0530 Subject: [PATCH 20/34] modify/convert relative path in json string --- src/openpecha/pecha/__init__.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 71f2d3d..64cd4ff 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -41,14 +41,14 @@ def set_annotations(self): def create_pecha_folder(self, base_path: Path): pecha_dir = base_path.joinpath(self.pecha_id) opf_dir = pecha_dir.joinpath(f"{self.pecha_id}.opf") - metadata_dir = opf_dir.joinpath("metadata.json") + metadata_fn = opf_dir.joinpath("metadata.json") base_dir = opf_dir.joinpath("base") layers_dir = opf_dir.joinpath("layers") layer_id_dir = layers_dir.joinpath(self.pecha_id) pecha_dir.mkdir(exist_ok=True) opf_dir.mkdir(exist_ok=True) - metadata_dir.write_text( + metadata_fn.write_text( json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" ) @@ -59,6 +59,16 @@ def create_pecha_folder(self, base_path: Path): self.annotation_fn = layer_id_dir self.base_fn = base_dir.joinpath(f"{self.pecha_id}.txt") + self.opf_fn = base_path + self.metadata_fn = metadata_fn + + def covert_to_relative_path(self, json_string: str): + """convert the absolute path to relative path for base file path in json string""" + json_object = json.loads(json_string) + for resource in json_object["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(original_path.relative_to(self.opf_fn)) + return json_object def write_annotations(self, base_path: Path = PECHAS_PATH): if not hasattr(self, "annotations"): @@ -95,9 +105,10 @@ def write_annotations(self, base_path: Path = PECHAS_PATH): data=data, ) """ save annotations in stam data model""" - self.annotation_store.set_filename( - self.annotation_fn.joinpath( - f"{self.metadata['annotation_label']}.json" - ).as_posix() - ) - self.annotation_store.save() + json_string = self.annotation_store.to_json_string() + json_object = self.covert_to_relative_path(json_string) + with open( + self.annotation_fn.joinpath(f"{self.metadata['annotation_label']}.json"), + "w", + ) as f: + f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) From b46fdd033ab27dee86247e14a345220d75910e8e Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 16:28:57 +0530 Subject: [PATCH 21/34] test case for Pecha write annotations --- tests/pecha/test_pecha.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/pecha/test_pecha.py b/tests/pecha/test_pecha.py index 81137f6..4f53a43 100644 --- a/tests/pecha/test_pecha.py +++ b/tests/pecha/test_pecha.py @@ -1,7 +1,16 @@ +from pathlib import Path +from shutil import rmtree + from openpecha.pecha import Pecha from openpecha.pecha.annotation import Annotation +def get_data_dir(): + base_path = Path(__file__).parent / "data" + base_path.mkdir(parents=True, exist_ok=True) + return base_path + + def get_segments(): return { "f2b056668a0c4ad3a085bdcd8e2d7adb": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", @@ -49,3 +58,20 @@ def test_pecha_set_annotations(): assert ( annotations == get_expected_annotations() ), "Pecha not able to set annotations for the segments" + + +def test_pecha_write_annotations(): + pecha_id = "IE7D6875F" + segments = get_segments() + metadata = get_metadata() + pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) + base_path = get_data_dir() + pecha.write_annotations(base_path=base_path) + assert pecha.base_fn.exists(), "Pecha not able to write base file" + assert pecha.metadata_fn.exists(), "Pecha not able to write metadata file" + assert pecha.annotation_fn.rglob( + "*.json" + ), "Pecha not able to write annotation file" + + """ clean up """ + rmtree(Path(base_path / pecha_id)) From 8872295647b3df1267656705aca6da971cc0b538 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 17:02:46 +0530 Subject: [PATCH 22/34] refactor/Pecha create pecha folder --- .gitignore | 1 + src/openpecha/config.py | 2 ++ src/openpecha/pecha/__init__.py | 41 +++++++++++++++------------------ tests/pecha/test_pecha.py | 15 +++++++----- 4 files changed, 30 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..cdf689f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +pechas/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/src/openpecha/config.py b/src/openpecha/config.py index 3123568..a55dc16 100644 --- a/src/openpecha/config.py +++ b/src/openpecha/config.py @@ -2,6 +2,8 @@ def _mkdir(path): + if path.is_dir(): + return path path.mkdir(exist_ok=True, parents=True) return path diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 64cd4ff..bcccd65 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -4,7 +4,7 @@ from stam import AnnotationStore, Offset, Selector -from openpecha.config import PECHAS_PATH +from openpecha.config import PECHAS_PATH, _mkdir from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation @@ -38,45 +38,40 @@ def set_annotations(self): char_count += len(segment) yield annotation - def create_pecha_folder(self, base_path: Path): - pecha_dir = base_path.joinpath(self.pecha_id) - opf_dir = pecha_dir.joinpath(f"{self.pecha_id}.opf") - metadata_fn = opf_dir.joinpath("metadata.json") - base_dir = opf_dir.joinpath("base") - layers_dir = opf_dir.joinpath("layers") - layer_id_dir = layers_dir.joinpath(self.pecha_id) - - pecha_dir.mkdir(exist_ok=True) - opf_dir.mkdir(exist_ok=True) - metadata_fn.write_text( + def create_pecha_folder(self, export_path: Path): + self.export_path = export_path + + pecha_dir = _mkdir(export_path.joinpath(self.pecha_id)) + opf_dir = _mkdir(pecha_dir.joinpath(f"{self.pecha_id}.opf")) + self.metadata_fn = opf_dir.joinpath("metadata.json") + base_dir = _mkdir(opf_dir.joinpath("base")) + layers_dir = _mkdir(opf_dir.joinpath("layers")) + layer_id_dir = _mkdir(layers_dir.joinpath(self.pecha_id)) + + """ write metadata and base file""" + self.metadata_fn.write_text( json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" ) - - base_dir.mkdir(exist_ok=True) - base_dir.joinpath(f"{self.pecha_id}.txt").write_text(self.base_text) - layers_dir.mkdir(exist_ok=True) - layer_id_dir.mkdir(exist_ok=True) + self.base_fn = Path(base_dir / f"{self.pecha_id}.txt") + self.base_fn.write_text(self.base_text) self.annotation_fn = layer_id_dir - self.base_fn = base_dir.joinpath(f"{self.pecha_id}.txt") - self.opf_fn = base_path - self.metadata_fn = metadata_fn def covert_to_relative_path(self, json_string: str): """convert the absolute path to relative path for base file path in json string""" json_object = json.loads(json_string) for resource in json_object["resources"]: original_path = Path(resource["@include"]) - resource["@include"] = str(original_path.relative_to(self.opf_fn)) + resource["@include"] = str(original_path.relative_to(self.export_path)) return json_object - def write_annotations(self, base_path: Path = PECHAS_PATH): + def write_annotations(self, export_path: Path = PECHAS_PATH): if not hasattr(self, "annotations"): self.annotations = self.set_annotations() self.base_text = "".join(self.segments.values()) - self.create_pecha_folder(base_path) + self.create_pecha_folder(export_path) """write annotations in stam data model""" self.annotation_store = AnnotationStore(id="PechaAnnotationStore") self.resource = self.annotation_store.add_resource( diff --git a/tests/pecha/test_pecha.py b/tests/pecha/test_pecha.py index 4f53a43..fd09a39 100644 --- a/tests/pecha/test_pecha.py +++ b/tests/pecha/test_pecha.py @@ -6,9 +6,9 @@ def get_data_dir(): - base_path = Path(__file__).parent / "data" - base_path.mkdir(parents=True, exist_ok=True) - return base_path + export_path = Path(__file__).parent / "data" + export_path.mkdir(parents=True, exist_ok=True) + return export_path def get_segments(): @@ -65,8 +65,8 @@ def test_pecha_write_annotations(): segments = get_segments() metadata = get_metadata() pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) - base_path = get_data_dir() - pecha.write_annotations(base_path=base_path) + export_path = get_data_dir() + pecha.write_annotations(export_path=export_path) assert pecha.base_fn.exists(), "Pecha not able to write base file" assert pecha.metadata_fn.exists(), "Pecha not able to write metadata file" assert pecha.annotation_fn.rglob( @@ -74,4 +74,7 @@ def test_pecha_write_annotations(): ), "Pecha not able to write annotation file" """ clean up """ - rmtree(Path(base_path / pecha_id)) + rmtree(Path(export_path / pecha_id)) + + +test_pecha_write_annotations() From 1c31178ef5635e038609a42dba9565aef27527a2 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 17:03:55 +0530 Subject: [PATCH 23/34] refactor --- src/openpecha/pecha/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index bcccd65..4e25c8d 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -43,12 +43,12 @@ def create_pecha_folder(self, export_path: Path): pecha_dir = _mkdir(export_path.joinpath(self.pecha_id)) opf_dir = _mkdir(pecha_dir.joinpath(f"{self.pecha_id}.opf")) - self.metadata_fn = opf_dir.joinpath("metadata.json") base_dir = _mkdir(opf_dir.joinpath("base")) layers_dir = _mkdir(opf_dir.joinpath("layers")) layer_id_dir = _mkdir(layers_dir.joinpath(self.pecha_id)) """ write metadata and base file""" + self.metadata_fn = opf_dir.joinpath("metadata.json") self.metadata_fn.write_text( json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" ) From edf86cae29e186b6e3187ec1133960a54935d48b Mon Sep 17 00:00:00 2001 From: Tenzin Date: Tue, 2 Jul 2024 17:08:19 +0530 Subject: [PATCH 24/34] delete/ unneccessary line --- tests/pecha/test_pecha.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/pecha/test_pecha.py b/tests/pecha/test_pecha.py index fd09a39..47f8e18 100644 --- a/tests/pecha/test_pecha.py +++ b/tests/pecha/test_pecha.py @@ -75,6 +75,3 @@ def test_pecha_write_annotations(): """ clean up """ rmtree(Path(export_path / pecha_id)) - - -test_pecha_write_annotations() From ba87b8f3fad5a871f9c002f43ce1fa679c778a0c Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 10:23:00 +0530 Subject: [PATCH 25/34] set ANNOTATION_STORE_ID and ANNOTATION_DATASET_ID in config --- src/openpecha/config.py | 3 +++ src/openpecha/pecha/__init__.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/openpecha/config.py b/src/openpecha/config.py index a55dc16..d436983 100644 --- a/src/openpecha/config.py +++ b/src/openpecha/config.py @@ -10,3 +10,6 @@ def _mkdir(path): BASE_PATH = _mkdir(Path.home() / ".pechadata") PECHAS_PATH = _mkdir(BASE_PATH / "pechas") + +PECHA_ANNOTATION_STORE_ID = "PechaAnnotationStore" +PECHA_DATASET_ID = "PechaDataSet" diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 4e25c8d..436059f 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -4,7 +4,12 @@ from stam import AnnotationStore, Offset, Selector -from openpecha.config import PECHAS_PATH, _mkdir +from openpecha.config import ( + PECHA_ANNOTATION_STORE_ID, + PECHA_DATASET_ID, + PECHAS_PATH, + _mkdir, +) from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation @@ -73,11 +78,11 @@ def write_annotations(self, export_path: Path = PECHAS_PATH): self.create_pecha_folder(export_path) """write annotations in stam data model""" - self.annotation_store = AnnotationStore(id="PechaAnnotationStore") + self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) self.resource = self.annotation_store.add_resource( id=self.pecha_id, filename=self.base_fn.as_posix() ) - self.dataset = self.annotation_store.add_dataset(id="PechaDataSet") + self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) self.dataset.add_key(self.metadata["annotation_category"]) unique_annotation_data_id = get_uuid() From a886053f77b66d9e171727a725fbb541d0fe2a55 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 11:06:46 +0530 Subject: [PATCH 26/34] set base file name as uuid --- src/openpecha/config.py | 5 +++-- src/openpecha/pecha/__init__.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/openpecha/config.py b/src/openpecha/config.py index d436983..e0fa952 100644 --- a/src/openpecha/config.py +++ b/src/openpecha/config.py @@ -1,9 +1,10 @@ from pathlib import Path +from shutil import rmtree def _mkdir(path): - if path.is_dir(): - return path + if path.exists(): + rmtree(path) path.mkdir(exist_ok=True, parents=True) return path diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 436059f..25a0cb4 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,5 +1,6 @@ import json from pathlib import Path +from shutil import rmtree from typing import Dict from stam import AnnotationStore, Offset, Selector @@ -48,18 +49,20 @@ def create_pecha_folder(self, export_path: Path): pecha_dir = _mkdir(export_path.joinpath(self.pecha_id)) opf_dir = _mkdir(pecha_dir.joinpath(f"{self.pecha_id}.opf")) - base_dir = _mkdir(opf_dir.joinpath("base")) - layers_dir = _mkdir(opf_dir.joinpath("layers")) - layer_id_dir = _mkdir(layers_dir.joinpath(self.pecha_id)) - """ write metadata and base file""" self.metadata_fn = opf_dir.joinpath("metadata.json") self.metadata_fn.write_text( json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" ) - self.base_fn = Path(base_dir / f"{self.pecha_id}.txt") + + base_file_name = get_uuid() + base_dir = _mkdir(opf_dir.joinpath("base")) + self.base_fn = Path(base_dir / f"{base_file_name}.txt") self.base_fn.write_text(self.base_text) + layers_dir = _mkdir(opf_dir.joinpath("layers")) + layer_id_dir = _mkdir(layers_dir.joinpath(base_file_name)) + self.annotation_fn = layer_id_dir def covert_to_relative_path(self, json_string: str): @@ -80,7 +83,7 @@ def write_annotations(self, export_path: Path = PECHAS_PATH): """write annotations in stam data model""" self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) self.resource = self.annotation_store.add_resource( - id=self.pecha_id, filename=self.base_fn.as_posix() + id=self.base_fn.name, filename=self.base_fn.as_posix() ) self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) self.dataset.add_key(self.metadata["annotation_category"]) From a939d5df7607e965191abc6bcac3a92e81c7c2b8 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 14:31:55 +0530 Subject: [PATCH 27/34] rename PlainText to PlainTextLineAlignedParser --- src/openpecha/alignment/parsers/plaintext.py | 2 +- tests/alignment/parsers/plaintext/test_plaintext.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 2b62fac..8c9e3dd 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -5,7 +5,7 @@ from openpecha.pecha import Pecha -class PlainText: +class PlainTextLineAlignedParser: def __init__(self, source_text: str, target_text: str, metadata: dict): self.source_text = source_text self.target_text = target_text diff --git a/tests/alignment/parsers/plaintext/test_plaintext.py b/tests/alignment/parsers/plaintext/test_plaintext.py index 488cd31..60c1aea 100644 --- a/tests/alignment/parsers/plaintext/test_plaintext.py +++ b/tests/alignment/parsers/plaintext/test_plaintext.py @@ -1,6 +1,6 @@ from pathlib import Path -from openpecha.alignment.parsers.plaintext import PlainText +from openpecha.alignment.parsers.plaintext import PlainTextLineAlignedParser from openpecha.pecha import Pecha @@ -27,7 +27,9 @@ def test_plaintext_parse(): target_path = DATA_DIR / "comments.txt" metadata = get_metadata() - plaintext = PlainText.from_files(source_path, target_path, metadata) + plaintext = PlainTextLineAlignedParser.from_files( + source_path, target_path, metadata + ) plaintext.parse() assert ( @@ -44,7 +46,9 @@ def test_plaintext_save(): target_path = DATA_DIR / "comments.txt" metadata = get_metadata() - plaintext = PlainText.from_files(source_path, target_path, metadata) + plaintext = PlainTextLineAlignedParser.from_files( + source_path, target_path, metadata + ) source_pecha, target_pecha = plaintext.save() assert isinstance( From 7741ab35dbe025955e87c7425831096a26512563 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 14:54:31 +0530 Subject: [PATCH 28/34] Layer abstraction class --- src/openpecha/pecha/layer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 src/openpecha/pecha/layer.py diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py new file mode 100644 index 0000000..1298d6a --- /dev/null +++ b/src/openpecha/pecha/layer.py @@ -0,0 +1,16 @@ +from enum import Enum + +from pydantic import BaseModel, Field + +from openpecha.ids import get_uuid + + +class LayerEnum(Enum): + segment = "Segment" + commentaries = "Commentaries" + + +class Layer(BaseModel): + id: str = Field(default_factory=get_uuid) + annotation_type: LayerEnum + annotations: dict = Field(default_factory=dict) From 6fd99d19a2dd3b3b209f8fea1e4dd8219ea762c4 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 15:01:22 +0530 Subject: [PATCH 29/34] join class Annotation in Layer field --- src/openpecha/pecha/annotation.py | 4 +++- src/openpecha/pecha/layer.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py index 99c2132..be58627 100644 --- a/src/openpecha/pecha/annotation.py +++ b/src/openpecha/pecha/annotation.py @@ -1,8 +1,10 @@ from pydantic import BaseModel, Field, ValidationInfo, field_validator +from openpecha.ids import get_uuid + class Annotation(BaseModel): - id_: str + id_: str = Field(default_factory=get_uuid) segment: str start: int = Field(ge=0) end: int = Field(ge=0) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 1298d6a..f3fef40 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -1,8 +1,10 @@ from enum import Enum +from typing import Dict from pydantic import BaseModel, Field from openpecha.ids import get_uuid +from openpecha.pecha.annotation import Annotation class LayerEnum(Enum): @@ -11,6 +13,6 @@ class LayerEnum(Enum): class Layer(BaseModel): - id: str = Field(default_factory=get_uuid) + id_: str = Field(default_factory=get_uuid) annotation_type: LayerEnum - annotations: dict = Field(default_factory=dict) + annotations: Dict[str, Annotation] = Field(default_factory=dict) From 2ea5a283e038222dffa607dba875ad6d94d96f27 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 15:55:35 +0530 Subject: [PATCH 30/34] modify/Pecha fields --- src/openpecha/pecha/__init__.py | 95 +++------------------------------ 1 file changed, 8 insertions(+), 87 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 25a0cb4..2e761ef 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -13,14 +13,20 @@ ) from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation +from openpecha.pecha.layer import Layer, LayerEnum class Pecha: def __init__( - self, pecha_id: str, segments: Dict[str, str], metadata: Dict[str, str] + self, + pecha_id: str, + bases: Dict[str, str] = None, + layers: Dict[str, Dict[LayerEnum, Layer]] = None, + metadata: Dict[str, str] = None, ) -> None: self.pecha_id = pecha_id - self.segments = segments + self.bases = bases + self.layers = layers self.metadata = metadata @classmethod @@ -30,88 +36,3 @@ def from_path(cls, path: str): @classmethod def from_id(cls, pecha_id: str): pass - - def set_annotations(self): - """set annotations for the segments""" - char_count = 0 - for segment_id, segment in self.segments.items(): - annotation = Annotation( - id_=segment_id, - segment=segment, - start=char_count, - end=char_count + len(segment), - ) - char_count += len(segment) - yield annotation - - def create_pecha_folder(self, export_path: Path): - self.export_path = export_path - - pecha_dir = _mkdir(export_path.joinpath(self.pecha_id)) - opf_dir = _mkdir(pecha_dir.joinpath(f"{self.pecha_id}.opf")) - """ write metadata and base file""" - self.metadata_fn = opf_dir.joinpath("metadata.json") - self.metadata_fn.write_text( - json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" - ) - - base_file_name = get_uuid() - base_dir = _mkdir(opf_dir.joinpath("base")) - self.base_fn = Path(base_dir / f"{base_file_name}.txt") - self.base_fn.write_text(self.base_text) - - layers_dir = _mkdir(opf_dir.joinpath("layers")) - layer_id_dir = _mkdir(layers_dir.joinpath(base_file_name)) - - self.annotation_fn = layer_id_dir - - def covert_to_relative_path(self, json_string: str): - """convert the absolute path to relative path for base file path in json string""" - json_object = json.loads(json_string) - for resource in json_object["resources"]: - original_path = Path(resource["@include"]) - resource["@include"] = str(original_path.relative_to(self.export_path)) - return json_object - - def write_annotations(self, export_path: Path = PECHAS_PATH): - if not hasattr(self, "annotations"): - self.annotations = self.set_annotations() - - self.base_text = "".join(self.segments.values()) - - self.create_pecha_folder(export_path) - """write annotations in stam data model""" - self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) - self.resource = self.annotation_store.add_resource( - id=self.base_fn.name, filename=self.base_fn.as_posix() - ) - self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) - self.dataset.add_key(self.metadata["annotation_category"]) - - unique_annotation_data_id = get_uuid() - for annotation in self.annotations: - target = Selector.textselector( - self.resource, - Offset.simple(annotation.start, annotation.end), - ) - data = [ - { - "id": unique_annotation_data_id, - "key": self.metadata["annotation_category"], - "value": self.metadata["annotation_label"], - "set": self.dataset.id(), - } - ] - self.annotation_store.annotate( - id=annotation.id_, - target=target, - data=data, - ) - """ save annotations in stam data model""" - json_string = self.annotation_store.to_json_string() - json_object = self.covert_to_relative_path(json_string) - with open( - self.annotation_fn.joinpath(f"{self.metadata['annotation_label']}.json"), - "w", - ) as f: - f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) From 3b5f6a2a62cf8cbfc21488869c3dcd1274189bcf Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 3 Jul 2024 16:11:27 +0530 Subject: [PATCH 31/34] modify/PlainTextLineAlignedParser parser func --- src/openpecha/alignment/parsers/plaintext.py | 58 +++++++++++++++----- src/openpecha/pecha/annotation.py | 3 - src/openpecha/pecha/layer.py | 11 ++-- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 8c9e3dd..d1e075a 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,8 +1,10 @@ from pathlib import Path +from typing import Dict -from openpecha.config import PECHAS_PATH from openpecha.ids import get_initial_pecha_id, get_uuid from openpecha.pecha import Pecha +from openpecha.pecha.annotation import Annotation +from openpecha.pecha.layer import Layer, LayerEnum class PlainTextLineAlignedParser: @@ -17,27 +19,53 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text, metadata) - def parse(self): - source_text_lines = self.source_text.split("\n") - target_text_lines = self.target_text.split("\n") - - self.source_segments = {get_uuid(): segment for segment in source_text_lines} - self.target_segments = {get_uuid(): segment for segment in target_text_lines} + def create_pecha_layer(self, base_text: str, annotation: LayerEnum): + """ """ + layer_annotations: Dict[str, Annotation] = {} + char_count = 0 + for segment in base_text.split("\n"): + layer_annotations[get_uuid()] = Annotation( + id_=get_uuid(), + segment=segment, + start=char_count, + end=char_count + len(segment), + ) + char_count += len(segment) - def save(self, base_path: Path = PECHAS_PATH): - if not hasattr(self, "source_segments") or not hasattr(self, "target_segments"): - self.parse() + return Layer(annotation_label=annotation, annotations=layer_annotations) - """ save the source and target pecha""" + def parse(self): source_pecha_id, target_pecha_id = ( get_initial_pecha_id(), get_initial_pecha_id(), ) - source_pecha = Pecha( - source_pecha_id, self.source_segments, self.metadata["source"] + + source_base_files = {get_uuid(): self.source_text} + target_base_files = {get_uuid(): self.target_text} + + source_annotation = LayerEnum(self.metadata["source"]["annotation_label"]) + target_annotation = LayerEnum(self.metadata["target"]["annotation_label"]) + + source_layers = { + get_uuid(): { + source_annotation: self.create_pecha_layer( + self.source_text, source_annotation + ) + } + } + target_layers = { + get_uuid(): { + target_annotation: self.create_pecha_layer( + self.target_text, target_annotation + ), + } + } + + source_pecha = Pecha( # noqa + source_pecha_id, source_base_files, source_layers, self.metadata["source"] ) - target_pecha = Pecha( - target_pecha_id, self.target_segments, self.metadata["target"] + target_pecha = Pecha( # noqa + target_pecha_id, target_base_files, target_layers, self.metadata["target"] ) return source_pecha, target_pecha diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py index be58627..c7f37c8 100644 --- a/src/openpecha/pecha/annotation.py +++ b/src/openpecha/pecha/annotation.py @@ -1,10 +1,7 @@ from pydantic import BaseModel, Field, ValidationInfo, field_validator -from openpecha.ids import get_uuid - class Annotation(BaseModel): - id_: str = Field(default_factory=get_uuid) segment: str start: int = Field(ge=0) end: int = Field(ge=0) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index f3fef40..9dc48a8 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -1,9 +1,6 @@ from enum import Enum from typing import Dict -from pydantic import BaseModel, Field - -from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation @@ -12,7 +9,7 @@ class LayerEnum(Enum): commentaries = "Commentaries" -class Layer(BaseModel): - id_: str = Field(default_factory=get_uuid) - annotation_type: LayerEnum - annotations: Dict[str, Annotation] = Field(default_factory=dict) +class Layer: + def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotation]): + self.annotation_label = annotation_label + self.annotations = annotations From e9ccaef5b51da438cf1753681235cb69aa5ba43c Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 09:18:31 +0530 Subject: [PATCH 32/34] fix/set basefile name and layer file name same --- src/openpecha/alignment/parsers/plaintext.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index d1e075a..92a1dfd 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -40,21 +40,22 @@ def parse(self): get_initial_pecha_id(), ) - source_base_files = {get_uuid(): self.source_text} - target_base_files = {get_uuid(): self.target_text} + source_base_fname, target_base_fname = get_uuid(), get_uuid() + source_base_files = {source_base_fname: self.source_text} + target_base_files = {target_base_fname: self.target_text} source_annotation = LayerEnum(self.metadata["source"]["annotation_label"]) target_annotation = LayerEnum(self.metadata["target"]["annotation_label"]) source_layers = { - get_uuid(): { + source_base_fname: { source_annotation: self.create_pecha_layer( self.source_text, source_annotation ) } } target_layers = { - get_uuid(): { + target_base_fname: { target_annotation: self.create_pecha_layer( self.target_text, target_annotation ), From c6b4fa0254f3f39449d16733cc7504be62a689b9 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 09:23:06 +0530 Subject: [PATCH 33/34] Layer write layer --- src/openpecha/pecha/layer.py | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 9dc48a8..038336b 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -1,6 +1,12 @@ +import json from enum import Enum +from pathlib import Path from typing import Dict +from stam import AnnotationStore, Offset, Selector + +from openpecha.config import PECHA_ANNOTATION_STORE_ID, PECHA_DATASET_ID +from openpecha.ids import get_uuid from openpecha.pecha.annotation import Annotation @@ -9,7 +15,60 @@ class LayerEnum(Enum): commentaries = "Commentaries" +def get_annotation_category(): + # TODO + # Return annotation category based on the annotation label + return "Structure Type" + + class Layer: def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotation]): self.annotation_label = annotation_label self.annotations = annotations + + def covert_to_relative_path(self, json_string: str, export_path: Path): + """convert the absolute path to relative path for base file path in json string""" + json_object = json.loads(json_string) + for resource in json_object["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(original_path.relative_to(export_path)) + return json_object + + def write_layer(self, base_file_path: Path, export_path: Path): + """write annotations in stam data model""" + self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) + self.resource = self.annotation_store.add_resource( + id=base_file_path.name, filename=base_file_path.as_posix() + ) + self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) + + annotation_category = get_annotation_category() + self.dataset.add_key(annotation_category) + + unique_annotation_data_id = get_uuid() + for annotation_id, annotation in self.annotations.items(): + target = Selector.textselector( + self.resource, + Offset.simple(annotation.start, annotation.end), + ) + data = [ + { + "id": unique_annotation_data_id, + "key": annotation_category, + "value": self.annotation_label.value, + "set": self.dataset.id(), + } + ] + self.annotation_store.annotate( + id=annotation_id, + target=target, + data=data, + ) + """ save annotations in json""" + json_string = self.annotation_store.to_json_string() + json_object = self.covert_to_relative_path(json_string, export_path) + with open( + export_path / f"{self.annotation_label.value}.json", + "w", + ) as f: + f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) From 12a379fa977894cca418db5ff42bf37daea7c71d Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 09:52:34 +0530 Subject: [PATCH 34/34] Pecha write function --- src/openpecha/pecha/__init__.py | 32 +++++++++++++++++++++++++++++--- src/openpecha/pecha/layer.py | 6 ++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 2e761ef..781d99e 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -20,9 +20,9 @@ class Pecha: def __init__( self, pecha_id: str, - bases: Dict[str, str] = None, - layers: Dict[str, Dict[LayerEnum, Layer]] = None, - metadata: Dict[str, str] = None, + bases: Dict[str, str], + layers: Dict[str, Dict[LayerEnum, Layer]], + metadata: Dict[str, str], ) -> None: self.pecha_id = pecha_id self.bases = bases @@ -36,3 +36,29 @@ def from_path(cls, path: str): @classmethod def from_id(cls, pecha_id: str): pass + + def write(self, export_path: Path = PECHAS_PATH): + + pecha_dir = _mkdir(export_path / self.pecha_id) + self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf") + """ write metadata """ + self.metadata_fn = self.base_path / "metadata.json" + self.metadata_fn.write_text( + json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" + ) + + """ write base file""" + base_dir = _mkdir(self.base_path / "base") + for base_fname, base_text in self.bases.items(): + base_fn = base_dir / f"{base_fname}.txt" + base_fn.write_text(base_text, encoding="utf-8") + + layer_dir = _mkdir(self.base_path / "layers") + """ write annotation layers""" + for layer_fname, layer_data in self.layers.items(): + for _, layer in layer_data.items(): + _mkdir(layer_dir / layer_fname) + layer.write( + base_file_path=base_dir / layer_fname, + export_path=layer_dir / layer_fname, + ) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 038336b..671896c 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -34,7 +34,7 @@ def covert_to_relative_path(self, json_string: str, export_path: Path): resource["@include"] = str(original_path.relative_to(export_path)) return json_object - def write_layer(self, base_file_path: Path, export_path: Path): + def write(self, base_file_path: Path, export_path: Path): """write annotations in stam data model""" self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) self.resource = self.annotation_store.add_resource( @@ -67,8 +67,10 @@ def write_layer(self, base_file_path: Path, export_path: Path): """ save annotations in json""" json_string = self.annotation_store.to_json_string() json_object = self.covert_to_relative_path(json_string, export_path) + """ add four uuid digits to the layer file name for uniqueness""" + layer_fname = f"{self.annotation_label.value}-{get_uuid()[:4]}.json" with open( - export_path / f"{self.annotation_label.value}.json", + export_path / layer_fname, "w", ) as f: f.write(json.dumps(json_object, indent=4, ensure_ascii=False))