From 12878b84a3b1db2cd8272197f6421aa2cbbf33dd Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 10:12:30 +0530 Subject: [PATCH 01/49] fix/layer name in Layer write function --- src/openpecha/pecha/__init__.py | 4 ++-- src/openpecha/pecha/layer.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 781d99e..41a7b6f 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -59,6 +59,6 @@ def write(self, export_path: Path = PECHAS_PATH): for _, layer in layer_data.items(): _mkdir(layer_dir / layer_fname) layer.write( - base_file_path=base_dir / layer_fname, - export_path=layer_dir / layer_fname, + base_file_path=base_dir / f"{layer_fname}.txt", + export_path=export_path, ) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 671896c..86441ad 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -68,9 +68,12 @@ def write(self, base_file_path: Path, export_path: Path): json_string = self.annotation_store.to_json_string() json_object = self.covert_to_relative_path(json_string, export_path) """ add four uuid digits to the layer file name for uniqueness""" - layer_fname = f"{self.annotation_label.value}-{get_uuid()[:4]}.json" + layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem + layer_file_path = ( + layer_dir / f"{self.annotation_label.value}-{get_uuid()[:4]}.json" + ) with open( - export_path / layer_fname, + layer_file_path, "w", ) as f: f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) From 2dcf475c893c5b5ea2345ddd8f2b1bb28a14d816 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 10:19:29 +0530 Subject: [PATCH 02/49] moodify LayerEnum value --- src/openpecha/pecha/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 86441ad..b9f08c7 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -12,7 +12,7 @@ class LayerEnum(Enum): segment = "Segment" - commentaries = "Commentaries" + commentaries = "Comment" def get_annotation_category(): From 2558f1904e30c0fa390ffb39398194344cd1217a Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 10:20:36 +0530 Subject: [PATCH 03/49] modify/meta data defination in test_plaintext --- .../parsers/plaintext/test_plaintext.py | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/tests/alignment/parsers/plaintext/test_plaintext.py b/tests/alignment/parsers/plaintext/test_plaintext.py index 60c1aea..78084fb 100644 --- a/tests/alignment/parsers/plaintext/test_plaintext.py +++ b/tests/alignment/parsers/plaintext/test_plaintext.py @@ -11,17 +11,15 @@ def get_data_dir(): def get_metadata(): return { "source": { - "annotation_category": "Structure Type", "annotation_label": "Segment", }, "target": { - "annotation_category": "Structure Type", "annotation_label": "Comment", }, } -def test_plaintext_parse(): +def test_PlainTextLineAlignedParser_parse(): DATA_DIR = get_data_dir() source_path = DATA_DIR / "segments.txt" target_path = DATA_DIR / "comments.txt" @@ -30,26 +28,7 @@ def test_plaintext_parse(): plaintext = PlainTextLineAlignedParser.from_files( source_path, target_path, metadata ) - plaintext.parse() - - assert ( - len(plaintext.source_segments) == 5 - ), "plaintext parser is not parsing source_segments correctly" - assert ( - len(plaintext.target_segments) == 5 - ), "plaintext parser is not parsing target_segments correctly" - - -def test_plaintext_save(): - DATA_DIR = get_data_dir() - source_path = DATA_DIR / "segments.txt" - target_path = DATA_DIR / "comments.txt" - - metadata = get_metadata() - plaintext = PlainTextLineAlignedParser.from_files( - source_path, target_path, metadata - ) - source_pecha, target_pecha = plaintext.save() + source_pecha, target_pecha = plaintext.parse() assert isinstance( source_pecha, Pecha From d2107160c895c4c2d9fd3188126a578f52cfc2f1 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 10:51:03 +0530 Subject: [PATCH 04/49] test for pecha write function --- .../base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt | 1 + .../Segment-bf13.json | 88 +++++++++++++++++++ .../IE7D6875F/IE7D6875F.opf/metadata.json | 3 + tests/pecha/test_pecha.py | 69 +++++++-------- 4 files changed, 122 insertions(+), 39 deletions(-) create mode 100644 tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt create mode 100644 tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json create mode 100644 tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json diff --git a/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt b/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt new file mode 100644 index 0000000..0b166fc --- /dev/null +++ b/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt @@ -0,0 +1 @@ +རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ། \ No newline at end of file diff --git a/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json b/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json new file mode 100644 index 0000000..92bcaec --- /dev/null +++ b/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json @@ -0,0 +1,88 @@ +{ + "@type": "AnnotationStore", + "@id": "PechaAnnotationStore", + "resources": [ + { + "@type": "TextResource", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "@include": "IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt" + } + ], + "annotationsets": [ + { + "@type": "AnnotationDataSet", + "@id": "PechaDataSet", + "keys": [ + { + "@type": "DataKey", + "@id": "Structure Type" + } + ], + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "key": "Structure Type", + "value": { + "@type": "String", + "value": "Segment" + } + } + ] + } + ], + "annotations": [ + { + "@type": "Annotation", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 0 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 39 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + }, + { + "@type": "Annotation", + "@id": "b696df2dbe314e8a87881a2bc391d0d5", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 39 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 103 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json b/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json new file mode 100644 index 0000000..cb740ab --- /dev/null +++ b/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json @@ -0,0 +1,3 @@ +{ + "annotation_label": "Segment" +} \ No newline at end of file diff --git a/tests/pecha/test_pecha.py b/tests/pecha/test_pecha.py index 47f8e18..52ba75d 100644 --- a/tests/pecha/test_pecha.py +++ b/tests/pecha/test_pecha.py @@ -3,75 +3,66 @@ from openpecha.pecha import Pecha from openpecha.pecha.annotation import Annotation +from openpecha.pecha.layer import Layer, LayerEnum def get_data_dir(): - export_path = Path(__file__).parent / "data" + export_path = Path(__file__).parent / "output" export_path.mkdir(parents=True, exist_ok=True) return export_path -def get_segments(): +def get_metadata(): return { - "f2b056668a0c4ad3a085bdcd8e2d7adb": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", - "b696df2dbe314e8a87881a2bc391d0d5": "བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", + "annotation_label": "Segment", } -def get_metadata(): +def get_base(): return { - "annotation_category": "Structure Type", - "annotation_label": "Segment", + "f2b056668a0c4ad3a085bdcd8e2d7adb": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།" # noqa } -def get_expected_annotations(): - expected_annotations = [ - Annotation( - id_="f2b056668a0c4ad3a085bdcd8e2d7adb", +def get_layer(): + return { + "f2b056668a0c4ad3a085bdcd8e2d7adb": { + LayerEnum("Segment"): Layer(LayerEnum("Segment"), get_annotations()) + } + } + + +def get_annotations(): + return { + "f2b056668a0c4ad3a085bdcd8e2d7adb": Annotation( segment="རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", start=0, end=39, metadata={}, ), - Annotation( - id_="b696df2dbe314e8a87881a2bc391d0d5", + "b696df2dbe314e8a87881a2bc391d0d5": Annotation( segment="བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", start=39, end=103, metadata={}, ), - ] - return expected_annotations + } -def test_pecha_set_annotations(): +def test_pecha_write(): pecha_id = "IE7D6875F" - segments = get_segments() - metadata = get_metadata() - pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) - assert isinstance( - pecha, Pecha - ), "Not able to create Pecha object with id, segments and metadata" + base = get_base() + layer = get_layer() + export_path = get_data_dir() + expected_output_path = Path(__file__).parent / "expected_output" - annotations = list(pecha.set_annotations()) - assert ( - annotations == get_expected_annotations() - ), "Pecha not able to set annotations for the segments" + pecha = Pecha(pecha_id=pecha_id, bases=base, layers=layer, metadata=get_metadata()) + pecha.write(export_path=export_path) + output_file_names = [file.name for file in export_path.rglob("*")].sort() + expected_file_names = [file.name for file in expected_output_path.rglob("*")].sort() -def test_pecha_write_annotations(): - pecha_id = "IE7D6875F" - segments = get_segments() - metadata = get_metadata() - pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) - export_path = get_data_dir() - pecha.write_annotations(export_path=export_path) - assert pecha.base_fn.exists(), "Pecha not able to write base file" - assert pecha.metadata_fn.exists(), "Pecha not able to write metadata file" - assert pecha.annotation_fn.rglob( - "*.json" - ), "Pecha not able to write annotation file" + assert output_file_names == expected_file_names """ clean up """ - rmtree(Path(export_path / pecha_id)) + rmtree(export_path) From 7437fb07c45b0137fd38ad405348f9fce9d27970 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 11:20:23 +0530 Subject: [PATCH 05/49] raise Value Error if annotation segment doesnt match the base text --- src/openpecha/pecha/layer.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index b9f08c7..60a797e 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -34,19 +34,19 @@ def covert_to_relative_path(self, json_string: str, export_path: Path): resource["@include"] = str(original_path.relative_to(export_path)) return json_object - def write(self, base_file_path: Path, export_path: Path): - """write annotations in stam data model""" - self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) - self.resource = self.annotation_store.add_resource( - id=base_file_path.name, filename=base_file_path.as_posix() - ) - self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) - + def set_annotations(self): annotation_category = get_annotation_category() self.dataset.add_key(annotation_category) - unique_annotation_data_id = get_uuid() + base_text = self.base_file_path.read_text(encoding="utf-8") for annotation_id, annotation in self.annotations.items(): + if ( + annotation.segment + != base_text[annotation.start : annotation.end] # noqa + ): + raise ValueError( + f"Annotation segment does not match the base text at {annotation_id}" + ) target = Selector.textselector( self.resource, Offset.simple(annotation.start, annotation.end), @@ -64,6 +64,16 @@ def write(self, base_file_path: Path, export_path: Path): target=target, data=data, ) + + def write(self, base_file_path: Path, export_path: Path): + self.base_file_path = base_file_path + """write annotations in stam data model""" + self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) + self.resource = self.annotation_store.add_resource( + id=base_file_path.name, filename=base_file_path.as_posix() + ) + self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) + self.set_annotations() """ save annotations in json""" json_string = self.annotation_store.to_json_string() json_object = self.covert_to_relative_path(json_string, export_path) From 57991fe902b3628e717f622bafb044e92f3218a1 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 11:38:57 +0530 Subject: [PATCH 06/49] Layer function set annotation --- src/openpecha/alignment/parsers/plaintext.py | 8 +++---- src/openpecha/pecha/layer.py | 24 +++++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 92a1dfd..c151000 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,5 +1,4 @@ from pathlib import Path -from typing import Dict from openpecha.ids import get_initial_pecha_id, get_uuid from openpecha.pecha import Pecha @@ -21,18 +20,19 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): def create_pecha_layer(self, base_text: str, annotation: LayerEnum): """ """ - layer_annotations: Dict[str, Annotation] = {} + layer = Layer(annotation_label=annotation, annotations={}) char_count = 0 for segment in base_text.split("\n"): - layer_annotations[get_uuid()] = Annotation( + annotation = Annotation( id_=get_uuid(), segment=segment, start=char_count, end=char_count + len(segment), ) + layer.set_annotation(annotation) char_count += len(segment) - return Layer(annotation_label=annotation, annotations=layer_annotations) + return layer def parse(self): source_pecha_id, target_pecha_id = ( diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 60a797e..e888f94 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -26,6 +26,11 @@ def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotatio self.annotation_label = annotation_label self.annotations = annotations + def set_annotation(self, annotation: Annotation, annotation_id=None): + if not annotation_id: + annotation_id = get_uuid() + self.annotations[annotation_id] = annotation + def covert_to_relative_path(self, json_string: str, export_path: Path): """convert the absolute path to relative path for base file path in json string""" json_object = json.loads(json_string) @@ -34,7 +39,14 @@ def covert_to_relative_path(self, json_string: str, export_path: Path): resource["@include"] = str(original_path.relative_to(export_path)) return json_object - def set_annotations(self): + def write(self, base_file_path: Path, export_path: Path): + self.base_file_path = base_file_path + """write annotations in stam data model""" + self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) + self.resource = self.annotation_store.add_resource( + id=base_file_path.name, filename=base_file_path.as_posix() + ) + self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) annotation_category = get_annotation_category() self.dataset.add_key(annotation_category) unique_annotation_data_id = get_uuid() @@ -64,16 +76,6 @@ def set_annotations(self): target=target, data=data, ) - - def write(self, base_file_path: Path, export_path: Path): - self.base_file_path = base_file_path - """write annotations in stam data model""" - self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) - self.resource = self.annotation_store.add_resource( - id=base_file_path.name, filename=base_file_path.as_posix() - ) - self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) - self.set_annotations() """ save annotations in json""" json_string = self.annotation_store.to_json_string() json_object = self.covert_to_relative_path(json_string, export_path) From 6879c9bd9a420e7f16141d7daa3db081bf4ae84b Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 11:53:21 +0530 Subject: [PATCH 07/49] Pecha set base file, layer and metadata file --- src/openpecha/alignment/parsers/plaintext.py | 37 ++++++-------- src/openpecha/pecha/__init__.py | 54 ++++++++++++++------ 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index c151000..274e4bf 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -39,35 +39,30 @@ def parse(self): get_initial_pecha_id(), get_initial_pecha_id(), ) + source_pecha = Pecha(source_pecha_id) + target_pecha = Pecha(target_pecha_id) source_base_fname, target_base_fname = get_uuid(), get_uuid() - source_base_files = {source_base_fname: self.source_text} - target_base_files = {target_base_fname: self.target_text} + source_pecha.set_base_file(source_base_fname, self.source_text) + target_pecha.set_base_file(target_base_fname, self.target_text) source_annotation = LayerEnum(self.metadata["source"]["annotation_label"]) target_annotation = LayerEnum(self.metadata["target"]["annotation_label"]) - source_layers = { - source_base_fname: { - source_annotation: self.create_pecha_layer( - self.source_text, source_annotation - ) - } - } - target_layers = { - target_base_fname: { - target_annotation: self.create_pecha_layer( - self.target_text, target_annotation - ), - } - } - - source_pecha = Pecha( # noqa - source_pecha_id, source_base_files, source_layers, self.metadata["source"] + source_pecha.set_layer( + source_base_fname, + source_annotation, + self.create_pecha_layer(self.source_text, source_annotation), ) - target_pecha = Pecha( # noqa - target_pecha_id, target_base_files, target_layers, self.metadata["target"] + target_pecha.set_layer( + target_base_fname, + target_annotation, + self.create_pecha_layer(self.target_text, target_annotation), ) + + source_pecha.set_metadata(self.metadata["source"]) + target_pecha.set_metadata(self.metadata["target"]) + return source_pecha, target_pecha # TODO: diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 41a7b6f..bca961a 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -20,9 +20,9 @@ class Pecha: def __init__( self, pecha_id: str, - bases: Dict[str, str], - layers: Dict[str, Dict[LayerEnum, Layer]], - metadata: Dict[str, str], + bases: Dict[str, str] = None, + layers: Dict[str, Dict[LayerEnum, Layer]] = None, + metadata: Dict[str, str] = None, ) -> None: self.pecha_id = pecha_id self.bases = bases @@ -37,6 +37,25 @@ def from_path(cls, path: str): def from_id(cls, pecha_id: str): pass + def set_base_file(self, base_file_name: str, base_text: str): + if not self.bases: + self.bases = {} + self.bases[base_file_name] = base_text + + def set_layer(self, layer_dir: str, layer: LayerEnum, layer_data: Layer): + """Note layer dir should be same as its corresponding base file name""" + if not self.layers: + self.layers = {} + if layer_dir not in self.layers: + self.layers[layer_dir] = {} + self.layers[layer_dir][layer] = layer_data + + def set_metadata(self, metadata: Dict[str, str]): + if not self.metadata: + self.metadata = {} + for key, value in metadata.items(): + self.metadata[key] = value + def write(self, export_path: Path = PECHAS_PATH): pecha_dir = _mkdir(export_path / self.pecha_id) @@ -48,17 +67,18 @@ def write(self, export_path: Path = PECHAS_PATH): ) """ write base file""" - base_dir = _mkdir(self.base_path / "base") - for base_fname, base_text in self.bases.items(): - base_fn = base_dir / f"{base_fname}.txt" - base_fn.write_text(base_text, encoding="utf-8") - - layer_dir = _mkdir(self.base_path / "layers") - """ write annotation layers""" - for layer_fname, layer_data in self.layers.items(): - for _, layer in layer_data.items(): - _mkdir(layer_dir / layer_fname) - layer.write( - base_file_path=base_dir / f"{layer_fname}.txt", - export_path=export_path, - ) + if self.bases: + base_dir = _mkdir(self.base_path / "base") + for base_fname, base_text in self.bases.items(): + base_fn = base_dir / f"{base_fname}.txt" + base_fn.write_text(base_text, encoding="utf-8") + if self.layers: + layer_dir = _mkdir(self.base_path / "layers") + """ write annotation layers""" + for layer_fname, layer_data in self.layers.items(): + for _, layer in layer_data.items(): + _mkdir(layer_dir / layer_fname) + layer.write( + base_file_path=base_dir / f"{layer_fname}.txt", + export_path=export_path, + ) From 1eb9be4f4e4c021b79d9aad9e46a7ce2e897f07d Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 11:54:26 +0530 Subject: [PATCH 08/49] refactor --- src/openpecha/pecha/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index bca961a..d3be453 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,18 +1,8 @@ import json from pathlib import Path -from shutil import rmtree from typing import Dict -from stam import AnnotationStore, Offset, Selector - -from openpecha.config import ( - PECHA_ANNOTATION_STORE_ID, - PECHA_DATASET_ID, - PECHAS_PATH, - _mkdir, -) -from openpecha.ids import get_uuid -from openpecha.pecha.annotation import Annotation +from openpecha.config import PECHAS_PATH, _mkdir from openpecha.pecha.layer import Layer, LayerEnum From 47290948bd8fbcc10136f03c259cff5c8fd6dfee Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 12:03:46 +0530 Subject: [PATCH 09/49] refactor code --- src/openpecha/alignment/parsers/plaintext.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 274e4bf..95a38bd 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import List from openpecha.ids import get_initial_pecha_id, get_uuid from openpecha.pecha import Pecha @@ -18,13 +19,12 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text, metadata) - def create_pecha_layer(self, base_text: str, annotation: LayerEnum): + def create_pecha_layer(self, segments: List[str], annotation_label: LayerEnum): """ """ - layer = Layer(annotation_label=annotation, annotations={}) + layer = Layer(annotation_label=annotation_label, annotations={}) char_count = 0 - for segment in base_text.split("\n"): + for segment in segments: annotation = Annotation( - id_=get_uuid(), segment=segment, start=char_count, end=char_count + len(segment), @@ -52,12 +52,12 @@ def parse(self): source_pecha.set_layer( source_base_fname, source_annotation, - self.create_pecha_layer(self.source_text, source_annotation), + self.create_pecha_layer(self.source_text.split("\n"), source_annotation), ) target_pecha.set_layer( target_base_fname, target_annotation, - self.create_pecha_layer(self.target_text, target_annotation), + self.create_pecha_layer(self.target_text.split("\n"), target_annotation), ) source_pecha.set_metadata(self.metadata["source"]) From 40b9781bc3a64b91cb35203c44d2e5c22040c079 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 12:16:26 +0530 Subject: [PATCH 10/49] modify/set Tuple of LayerEnum and str as Layer key --- src/openpecha/alignment/parsers/plaintext.py | 4 ++-- src/openpecha/pecha/__init__.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 95a38bd..1ea8ded 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -51,12 +51,12 @@ def parse(self): source_pecha.set_layer( source_base_fname, - source_annotation, + (source_annotation, None), self.create_pecha_layer(self.source_text.split("\n"), source_annotation), ) target_pecha.set_layer( target_base_fname, - target_annotation, + (target_annotation, None), self.create_pecha_layer(self.target_text.split("\n"), target_annotation), ) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index d3be453..77b2e18 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,8 +1,9 @@ import json from pathlib import Path -from typing import Dict +from typing import Dict, Optional, Tuple from openpecha.config import PECHAS_PATH, _mkdir +from openpecha.ids import get_uuid from openpecha.pecha.layer import Layer, LayerEnum @@ -11,7 +12,7 @@ def __init__( self, pecha_id: str, bases: Dict[str, str] = None, - layers: Dict[str, Dict[LayerEnum, Layer]] = None, + layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = None, metadata: Dict[str, str] = None, ) -> None: self.pecha_id = pecha_id @@ -32,13 +33,20 @@ def set_base_file(self, base_file_name: str, base_text: str): self.bases = {} self.bases[base_file_name] = base_text - def set_layer(self, layer_dir: str, layer: LayerEnum, layer_data: Layer): + def set_layer( + self, layer_dir: str, layer_key: Tuple[LayerEnum, Optional[str]], layer: Layer + ): """Note layer dir should be same as its corresponding base file name""" if not self.layers: self.layers = {} if layer_dir not in self.layers: self.layers[layer_dir] = {} - self.layers[layer_dir][layer] = layer_data + + """ layer key is a tuple of layer label and layer id""" + """ A particular volume can have multiple layers with same label but different id""" + layer_label, layer_id = layer_key + layer_id = layer_id if layer_id else get_uuid() + self.layers[layer_dir][(layer_label, layer_id)] = layer def set_metadata(self, metadata: Dict[str, str]): if not self.metadata: From ba8c3c94f164a6ec2b4ad665f65c5a8e7c235f2f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 12:21:23 +0530 Subject: [PATCH 11/49] get annotation category --- src/openpecha/pecha/layer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index e888f94..60012cc 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -15,10 +15,13 @@ class LayerEnum(Enum): commentaries = "Comment" -def get_annotation_category(): - # TODO - # Return annotation category based on the annotation label - return "Structure Type" +class LayerGroupEnum(Enum): + structure_type = "Structure Type" + + +def get_annotation_category(layer_label: LayerEnum) -> LayerGroupEnum: + """return the annotation category for the layer label""" + return LayerGroupEnum.structure_type class Layer: @@ -47,7 +50,7 @@ def write(self, base_file_path: Path, export_path: Path): id=base_file_path.name, filename=base_file_path.as_posix() ) self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) - annotation_category = get_annotation_category() + annotation_category = get_annotation_category(self.annotation_label).value self.dataset.add_key(annotation_category) unique_annotation_data_id = get_uuid() base_text = self.base_file_path.read_text(encoding="utf-8") From f22e47e14fb6acfe18f037a3598350a39e11259f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 12:31:48 +0530 Subject: [PATCH 12/49] refactor test folder structure --- .../IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt | 0 .../layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json | 0 .../expected_output/IE7D6875F/IE7D6875F.opf/metadata.json | 0 tests/pecha/{ => write}/test_pecha.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/pecha/{ => write}/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt (100%) rename tests/pecha/{ => write}/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json (100%) rename tests/pecha/{ => write}/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json (100%) rename tests/pecha/{ => write}/test_pecha.py (100%) diff --git a/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt similarity index 100% rename from tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt rename to tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt diff --git a/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json similarity index 100% rename from tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json rename to tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json diff --git a/tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json similarity index 100% rename from tests/pecha/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json rename to tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json diff --git a/tests/pecha/test_pecha.py b/tests/pecha/write/test_pecha.py similarity index 100% rename from tests/pecha/test_pecha.py rename to tests/pecha/write/test_pecha.py From 11f79bbd7ad18b5b7f5ac240f424c3f74008c25b Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 16:12:15 +0530 Subject: [PATCH 13/49] Layer class method from_path --- src/openpecha/pecha/layer.py | 42 ++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 60012cc..241b5b2 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -24,22 +24,53 @@ def get_annotation_category(layer_label: LayerEnum) -> LayerGroupEnum: return LayerGroupEnum.structure_type +def convert_relative_to_absolute_path(json_data, absolute_base_path: Path): + """call after loading the stam from json""" + for resource in json_data["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(absolute_base_path / original_path) + return json_data + + class Layer: def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotation]): self.annotation_label = annotation_label self.annotations = annotations + @classmethod + def from_path(cls, layer_file_path: Path): + """get annotation label""" + annotation_label = LayerEnum(layer_file_path.stem.split("-")[0]) + """ load annotations from json""" + with open(layer_file_path) as f: + json_data = json.load(f) + absolute_base_path = layer_file_path.parents[4] + json_data = convert_relative_to_absolute_path(json_data, absolute_base_path) + annotation_store = AnnotationStore(string=json.dumps(json_data)) + + layer_annotations: Dict[str, Annotation] = {} + for annotation in annotation_store.annotations(): + annotation_id, segment = annotation.id(), str(annotation) + start = annotation.offset().begin().value() + end = annotation.offset().end().value() + layer_annotations[annotation_id] = Annotation( + segment=segment, start=start, end=end + ) + + return cls(annotation_label, layer_annotations) + def set_annotation(self, annotation: Annotation, annotation_id=None): if not annotation_id: annotation_id = get_uuid() self.annotations[annotation_id] = annotation - def covert_to_relative_path(self, json_string: str, export_path: Path): - """convert the absolute path to relative path for base file path in json string""" + def convert_absolute_to_relative_path(self, absolute_base_path: Path): + """call before saving the stam in json""" + json_string = self.annotation_store.to_json() json_object = json.loads(json_string) for resource in json_object["resources"]: original_path = Path(resource["@include"]) - resource["@include"] = str(original_path.relative_to(export_path)) + resource["@include"] = str(original_path.relative_to(absolute_base_path)) return json_object def write(self, base_file_path: Path, export_path: Path): @@ -80,8 +111,7 @@ def write(self, base_file_path: Path, export_path: Path): data=data, ) """ save annotations in json""" - json_string = self.annotation_store.to_json_string() - json_object = self.covert_to_relative_path(json_string, export_path) + pecha_json = self.convert_absolute_to_relative_path(export_path) """ add four uuid digits to the layer file name for uniqueness""" layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem layer_file_path = ( @@ -91,4 +121,4 @@ def write(self, base_file_path: Path, export_path: Path): layer_file_path, "w", ) as f: - f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) + f.write(json.dumps(pecha_json, indent=4, ensure_ascii=False)) From fdb37abf47c530c8a442a3b6ff9da64ab91befe5 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Thu, 4 Jul 2024 16:32:54 +0530 Subject: [PATCH 14/49] Pecha classmethod from_path --- src/openpecha/pecha/__init__.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 77b2e18..f0322a4 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -21,8 +21,25 @@ def __init__( self.metadata = metadata @classmethod - def from_path(cls, path: str): - pass + def from_path(cls, base_path: Path): + pecha_id = base_path.stem + pecha = Pecha(pecha_id=pecha_id) + + with open(base_path / "metadata.json", encoding="utf-8") as f: + metadata = json.load(f) + pecha.set_metadata(metadata) + + for base_file in (base_path / "base").rglob("*.txt"): + base_text = base_file.read_text(encoding="utf-8") + pecha.set_base_file(base_file.stem, base_text) + + for layer_dir in (base_path / "layers").iterdir(): + for layer_file in layer_dir.glob("*.json"): + layer = Layer.from_path(layer_file) + layer_key = (layer.annotation_label, layer_file.stem) + pecha.set_layer(layer_dir.stem, layer_key, layer) + + return pecha @classmethod def from_id(cls, pecha_id: str): From a28eb5cd77f7b54598cb40721d60ae2805d84df4 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 11:51:32 +0530 Subject: [PATCH 15/49] fix/stam function to_json_string --- src/openpecha/pecha/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 241b5b2..e914acc 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -66,7 +66,7 @@ def set_annotation(self, annotation: Annotation, annotation_id=None): def convert_absolute_to_relative_path(self, absolute_base_path: Path): """call before saving the stam in json""" - json_string = self.annotation_store.to_json() + json_string = self.annotation_store.to_json_string() json_object = json.loads(json_string) for resource in json_object["resources"]: original_path = Path(resource["@include"]) From ee9cc07fd1bb099624063d313f923be985ad648f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 11:57:53 +0530 Subject: [PATCH 16/49] modify/annotation_label -> annotation_type --- src/openpecha/alignment/parsers/plaintext.py | 8 ++++---- src/openpecha/pecha/layer.py | 10 +++++----- tests/alignment/parsers/plaintext/test_plaintext.py | 4 ++-- tests/pecha/write/test_pecha.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 1ea8ded..e151127 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -19,9 +19,9 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text, metadata) - def create_pecha_layer(self, segments: List[str], annotation_label: LayerEnum): + def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum): """ """ - layer = Layer(annotation_label=annotation_label, annotations={}) + layer = Layer(annotation_type=annotation_type, annotations={}) char_count = 0 for segment in segments: annotation = Annotation( @@ -46,8 +46,8 @@ def parse(self): source_pecha.set_base_file(source_base_fname, self.source_text) target_pecha.set_base_file(target_base_fname, self.target_text) - source_annotation = LayerEnum(self.metadata["source"]["annotation_label"]) - target_annotation = LayerEnum(self.metadata["target"]["annotation_label"]) + source_annotation = LayerEnum(self.metadata["source"]["annotation_type"]) + target_annotation = LayerEnum(self.metadata["target"]["annotation_type"]) source_pecha.set_layer( source_base_fname, diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 60012cc..62e6b97 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -25,8 +25,8 @@ def get_annotation_category(layer_label: LayerEnum) -> LayerGroupEnum: class Layer: - def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotation]): - self.annotation_label = annotation_label + def __init__(self, annotation_type: LayerEnum, annotations: Dict[str, Annotation]): + self.annotation_type = annotation_type self.annotations = annotations def set_annotation(self, annotation: Annotation, annotation_id=None): @@ -50,7 +50,7 @@ def write(self, base_file_path: Path, export_path: Path): id=base_file_path.name, filename=base_file_path.as_posix() ) self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) - annotation_category = get_annotation_category(self.annotation_label).value + annotation_category = get_annotation_category(self.annotation_type).value self.dataset.add_key(annotation_category) unique_annotation_data_id = get_uuid() base_text = self.base_file_path.read_text(encoding="utf-8") @@ -70,7 +70,7 @@ def write(self, base_file_path: Path, export_path: Path): { "id": unique_annotation_data_id, "key": annotation_category, - "value": self.annotation_label.value, + "value": self.annotation_type.value, "set": self.dataset.id(), } ] @@ -85,7 +85,7 @@ def write(self, base_file_path: Path, export_path: Path): """ add four uuid digits to the layer file name for uniqueness""" layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem layer_file_path = ( - layer_dir / f"{self.annotation_label.value}-{get_uuid()[:4]}.json" + layer_dir / f"{self.annotation_type.value}-{get_uuid()[:4]}.json" ) with open( layer_file_path, diff --git a/tests/alignment/parsers/plaintext/test_plaintext.py b/tests/alignment/parsers/plaintext/test_plaintext.py index 78084fb..91e6a09 100644 --- a/tests/alignment/parsers/plaintext/test_plaintext.py +++ b/tests/alignment/parsers/plaintext/test_plaintext.py @@ -11,10 +11,10 @@ def get_data_dir(): def get_metadata(): return { "source": { - "annotation_label": "Segment", + "annotation_type": "Segment", }, "target": { - "annotation_label": "Comment", + "annotation_type": "Comment", }, } diff --git a/tests/pecha/write/test_pecha.py b/tests/pecha/write/test_pecha.py index 52ba75d..d38caf5 100644 --- a/tests/pecha/write/test_pecha.py +++ b/tests/pecha/write/test_pecha.py @@ -14,7 +14,7 @@ def get_data_dir(): def get_metadata(): return { - "annotation_label": "Segment", + "annotation_type": "Segment", } From 3a155affa5e44b0aa01222e7172d023495544eea Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 12:05:31 +0530 Subject: [PATCH 17/49] fix/passing empty dict --- pyproject.toml | 1 + src/openpecha/alignment/parsers/plaintext.py | 2 +- src/openpecha/pecha/layer.py | 7 ++++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8f147a9..b0a336a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ dependencies = [ "pydantic >= 2.7.4", "stam == 0.8.2", + "collection >= 0.1.6", ] diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index e151127..dd23a46 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -21,7 +21,7 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum): """ """ - layer = Layer(annotation_type=annotation_type, annotations={}) + layer = Layer(annotation_type=annotation_type) char_count = 0 for segment in segments: annotation = Annotation( diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 62e6b97..b490720 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -1,4 +1,5 @@ import json +from collections import defaultdict from enum import Enum from pathlib import Path from typing import Dict @@ -25,7 +26,11 @@ def get_annotation_category(layer_label: LayerEnum) -> LayerGroupEnum: class Layer: - def __init__(self, annotation_type: LayerEnum, annotations: Dict[str, Annotation]): + def __init__( + self, + annotation_type: LayerEnum, + annotations: Dict[str, Annotation] = defaultdict(), + ): self.annotation_type = annotation_type self.annotations = annotations From 1b07070644e86ce69b8cedfb0fd871afc35a6a7f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 12:27:12 +0530 Subject: [PATCH 18/49] delete segment attribute from Annotation class --- src/openpecha/alignment/parsers/plaintext.py | 1 - src/openpecha/pecha/annotation.py | 1 - src/openpecha/pecha/layer.py | 8 -------- tests/pecha/write/test_pecha.py | 5 +++-- 4 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index dd23a46..a968cfc 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -25,7 +25,6 @@ def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum): char_count = 0 for segment in segments: annotation = Annotation( - segment=segment, start=char_count, end=char_count + len(segment), ) diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py index c7f37c8..f700849 100644 --- a/src/openpecha/pecha/annotation.py +++ b/src/openpecha/pecha/annotation.py @@ -2,7 +2,6 @@ class Annotation(BaseModel): - segment: str start: int = Field(ge=0) end: int = Field(ge=0) metadata: dict = Field(default_factory=dict) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index b490720..cea03a0 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -58,15 +58,7 @@ def write(self, base_file_path: Path, export_path: Path): annotation_category = get_annotation_category(self.annotation_type).value self.dataset.add_key(annotation_category) unique_annotation_data_id = get_uuid() - base_text = self.base_file_path.read_text(encoding="utf-8") for annotation_id, annotation in self.annotations.items(): - if ( - annotation.segment - != base_text[annotation.start : annotation.end] # noqa - ): - raise ValueError( - f"Annotation segment does not match the base text at {annotation_id}" - ) target = Selector.textselector( self.resource, Offset.simple(annotation.start, annotation.end), diff --git a/tests/pecha/write/test_pecha.py b/tests/pecha/write/test_pecha.py index d38caf5..687c690 100644 --- a/tests/pecha/write/test_pecha.py +++ b/tests/pecha/write/test_pecha.py @@ -35,13 +35,11 @@ def get_layer(): def get_annotations(): return { "f2b056668a0c4ad3a085bdcd8e2d7adb": Annotation( - segment="རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", start=0, end=39, metadata={}, ), "b696df2dbe314e8a87881a2bc391d0d5": Annotation( - segment="བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", start=39, end=103, metadata={}, @@ -66,3 +64,6 @@ def test_pecha_write(): """ clean up """ rmtree(export_path) + + +test_pecha_write() From 8f525ce3b57ee0d228e68e225af37e452a502f4f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 12:33:09 +0530 Subject: [PATCH 19/49] modify/set basefile name in set_base_file function --- src/openpecha/alignment/parsers/plaintext.py | 11 +++++------ src/openpecha/pecha/__init__.py | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index a968cfc..b4b5cce 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import List -from openpecha.ids import get_initial_pecha_id, get_uuid +from openpecha.ids import get_initial_pecha_id from openpecha.pecha import Pecha from openpecha.pecha.annotation import Annotation from openpecha.pecha.layer import Layer, LayerEnum @@ -41,20 +41,19 @@ def parse(self): source_pecha = Pecha(source_pecha_id) target_pecha = Pecha(target_pecha_id) - source_base_fname, target_base_fname = get_uuid(), get_uuid() - source_pecha.set_base_file(source_base_fname, self.source_text) - target_pecha.set_base_file(target_base_fname, self.target_text) + source_base_name = source_pecha.set_base_file(self.source_text) + target_base_name = target_pecha.set_base_file(self.target_text) source_annotation = LayerEnum(self.metadata["source"]["annotation_type"]) target_annotation = LayerEnum(self.metadata["target"]["annotation_type"]) source_pecha.set_layer( - source_base_fname, + source_base_name, (source_annotation, None), self.create_pecha_layer(self.source_text.split("\n"), source_annotation), ) target_pecha.set_layer( - target_base_fname, + target_base_name, (target_annotation, None), self.create_pecha_layer(self.target_text.split("\n"), target_annotation), ) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 77b2e18..d72cc13 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,4 +1,5 @@ import json +from collections import defaultdict from pathlib import Path from typing import Dict, Optional, Tuple @@ -11,7 +12,7 @@ class Pecha: def __init__( self, pecha_id: str, - bases: Dict[str, str] = None, + bases: Dict[str, str] = defaultdict(), layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = None, metadata: Dict[str, str] = None, ) -> None: @@ -28,10 +29,10 @@ def from_path(cls, path: str): def from_id(cls, pecha_id: str): pass - def set_base_file(self, base_file_name: str, base_text: str): - if not self.bases: - self.bases = {} + def set_base_file(self, base_text: str) -> str: + base_file_name = get_uuid() self.bases[base_file_name] = base_text + return base_file_name def set_layer( self, layer_dir: str, layer_key: Tuple[LayerEnum, Optional[str]], layer: Layer From 3218572740f38b92ec5d89ed4ebc87debf5989f0 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 12:35:08 +0530 Subject: [PATCH 20/49] modify layer_label -> layer_type --- src/openpecha/pecha/layer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index cea03a0..9a0676b 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -20,8 +20,10 @@ class LayerGroupEnum(Enum): structure_type = "Structure Type" -def get_annotation_category(layer_label: LayerEnum) -> LayerGroupEnum: +def get_annotation_category(layer_type: LayerEnum) -> LayerGroupEnum: """return the annotation category for the layer label""" + if layer_type == LayerEnum.segment: + return LayerGroupEnum.structure_type return LayerGroupEnum.structure_type From 4ca26d009121a35625069465181cf9ae73630bbc Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 15:24:15 +0530 Subject: [PATCH 21/49] modify/export_path -> output_path --- src/openpecha/pecha/__init__.py | 6 +++--- src/openpecha/pecha/layer.py | 8 ++++---- tests/pecha/write/test_pecha.py | 17 +++++++---------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index d72cc13..52ef168 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -55,9 +55,9 @@ def set_metadata(self, metadata: Dict[str, str]): for key, value in metadata.items(): self.metadata[key] = value - def write(self, export_path: Path = PECHAS_PATH): + def write(self, output_path: Path = PECHAS_PATH): - pecha_dir = _mkdir(export_path / self.pecha_id) + pecha_dir = _mkdir(output_path / self.pecha_id) self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf") """ write metadata """ self.metadata_fn = self.base_path / "metadata.json" @@ -79,5 +79,5 @@ def write(self, export_path: Path = PECHAS_PATH): _mkdir(layer_dir / layer_fname) layer.write( base_file_path=base_dir / f"{layer_fname}.txt", - export_path=export_path, + output_path=output_path, ) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 9a0676b..464c984 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -41,15 +41,15 @@ def set_annotation(self, annotation: Annotation, annotation_id=None): annotation_id = get_uuid() self.annotations[annotation_id] = annotation - def covert_to_relative_path(self, json_string: str, export_path: Path): + def covert_to_relative_path(self, json_string: str, output_path: Path): """convert the absolute path to relative path for base file path in json string""" json_object = json.loads(json_string) for resource in json_object["resources"]: original_path = Path(resource["@include"]) - resource["@include"] = str(original_path.relative_to(export_path)) + resource["@include"] = str(original_path.relative_to(output_path)) return json_object - def write(self, base_file_path: Path, export_path: Path): + def write(self, base_file_path: Path, output_path: Path): self.base_file_path = base_file_path """write annotations in stam data model""" self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) @@ -80,7 +80,7 @@ def write(self, base_file_path: Path, export_path: Path): ) """ save annotations in json""" json_string = self.annotation_store.to_json_string() - json_object = self.covert_to_relative_path(json_string, export_path) + json_object = self.covert_to_relative_path(json_string, output_path) """ add four uuid digits to the layer file name for uniqueness""" layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem layer_file_path = ( diff --git a/tests/pecha/write/test_pecha.py b/tests/pecha/write/test_pecha.py index 687c690..2b4be16 100644 --- a/tests/pecha/write/test_pecha.py +++ b/tests/pecha/write/test_pecha.py @@ -7,9 +7,9 @@ def get_data_dir(): - export_path = Path(__file__).parent / "output" - export_path.mkdir(parents=True, exist_ok=True) - return export_path + output_path = Path(__file__).parent / "output" + output_path.mkdir(parents=True, exist_ok=True) + return output_path def get_metadata(): @@ -51,19 +51,16 @@ def test_pecha_write(): pecha_id = "IE7D6875F" base = get_base() layer = get_layer() - export_path = get_data_dir() + output_path = get_data_dir() expected_output_path = Path(__file__).parent / "expected_output" pecha = Pecha(pecha_id=pecha_id, bases=base, layers=layer, metadata=get_metadata()) - pecha.write(export_path=export_path) + pecha.write(output_path=output_path) - output_file_names = [file.name for file in export_path.rglob("*")].sort() + output_file_names = [file.name for file in output_path.rglob("*")].sort() expected_file_names = [file.name for file in expected_output_path.rglob("*")].sort() assert output_file_names == expected_file_names """ clean up """ - rmtree(export_path) - - -test_pecha_write() + rmtree(output_path) From 5ca3fcb2e7f646a0c1de91236fcaa9936fa74133 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 15:26:53 +0530 Subject: [PATCH 22/49] modify/base_fname -> base_name --- src/openpecha/pecha/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 52ef168..5f4b881 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -68,16 +68,16 @@ def write(self, output_path: Path = PECHAS_PATH): """ write base file""" if self.bases: base_dir = _mkdir(self.base_path / "base") - for base_fname, base_text in self.bases.items(): - base_fn = base_dir / f"{base_fname}.txt" + for base_name, base_text in self.bases.items(): + base_fn = base_dir / f"{base_name}.txt" base_fn.write_text(base_text, encoding="utf-8") if self.layers: layer_dir = _mkdir(self.base_path / "layers") """ write annotation layers""" - for layer_fname, layer_data in self.layers.items(): + for layer_name, layer_data in self.layers.items(): for _, layer in layer_data.items(): - _mkdir(layer_dir / layer_fname) + _mkdir(layer_dir / layer_name) layer.write( - base_file_path=base_dir / f"{layer_fname}.txt", + base_file_path=base_dir / f"{layer_name}.txt", output_path=output_path, ) From 7c9663f2f6dc19782f4250802e75a393c13c11fc Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 15:42:13 +0530 Subject: [PATCH 23/49] add id_ attribute to Annotation class --- src/openpecha/pecha/annotation.py | 3 +++ src/openpecha/pecha/layer.py | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py index f700849..577cddd 100644 --- a/src/openpecha/pecha/annotation.py +++ b/src/openpecha/pecha/annotation.py @@ -1,7 +1,10 @@ from pydantic import BaseModel, Field, ValidationInfo, field_validator +from openpecha.ids import get_uuid + class Annotation(BaseModel): + id_: str = Field(default_factory=get_uuid) start: int = Field(ge=0) end: int = Field(ge=0) metadata: dict = Field(default_factory=dict) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 464c984..f254234 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -36,10 +36,8 @@ def __init__( self.annotation_type = annotation_type self.annotations = annotations - def set_annotation(self, annotation: Annotation, annotation_id=None): - if not annotation_id: - annotation_id = get_uuid() - self.annotations[annotation_id] = annotation + def set_annotation(self, annotation: Annotation): + self.annotations[annotation.id_] = annotation def covert_to_relative_path(self, json_string: str, output_path: Path): """convert the absolute path to relative path for base file path in json string""" From a606ea05840ef4b803a34d948ac6469e551634f5 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 16:21:33 +0530 Subject: [PATCH 24/49] add id_ attribute to Layer class --- src/openpecha/alignment/parsers/plaintext.py | 11 +++---- src/openpecha/ids.py | 4 +++ src/openpecha/pecha/__init__.py | 21 ++++++------ src/openpecha/pecha/layer.py | 34 ++++++++++---------- tests/pecha/write/test_pecha.py | 17 ++++++++-- 5 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index b4b5cce..80240f2 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -44,18 +44,15 @@ def parse(self): source_base_name = source_pecha.set_base_file(self.source_text) target_base_name = target_pecha.set_base_file(self.target_text) - source_annotation = LayerEnum(self.metadata["source"]["annotation_type"]) - target_annotation = LayerEnum(self.metadata["target"]["annotation_type"]) - source_pecha.set_layer( source_base_name, - (source_annotation, None), - self.create_pecha_layer(self.source_text.split("\n"), source_annotation), + LayerEnum.segment, + self.create_pecha_layer(self.source_text.split("\n"), LayerEnum.segment), ) target_pecha.set_layer( target_base_name, - (target_annotation, None), - self.create_pecha_layer(self.target_text.split("\n"), target_annotation), + LayerEnum.segment, + self.create_pecha_layer(self.target_text.split("\n"), LayerEnum.segment), ) source_pecha.set_metadata(self.metadata["source"]) diff --git a/src/openpecha/ids.py b/src/openpecha/ids.py index b27d246..9560ae5 100644 --- a/src/openpecha/ids.py +++ b/src/openpecha/ids.py @@ -6,6 +6,10 @@ def get_uuid(): return uuid4().hex +def get_fourchar_uuid(): + return get_uuid()[:4] + + def get_id(prefix, length): return prefix + "".join(random.choices(uuid4().hex, k=length)).upper() diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 5f4b881..9a1c523 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -13,7 +13,9 @@ def __init__( self, pecha_id: str, bases: Dict[str, str] = defaultdict(), - layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = None, + layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict( + lambda: defaultdict() + ), metadata: Dict[str, str] = None, ) -> None: self.pecha_id = pecha_id @@ -35,19 +37,14 @@ def set_base_file(self, base_text: str) -> str: return base_file_name def set_layer( - self, layer_dir: str, layer_key: Tuple[LayerEnum, Optional[str]], layer: Layer - ): - """Note layer dir should be same as its corresponding base file name""" - if not self.layers: - self.layers = {} - if layer_dir not in self.layers: - self.layers[layer_dir] = {} + self, layer_dir: str, annotation_type: LayerEnum, layer: Layer + ) -> str: - """ layer key is a tuple of layer label and layer id""" + """layer key is a tuple of layer label and layer id""" """ A particular volume can have multiple layers with same label but different id""" - layer_label, layer_id = layer_key - layer_id = layer_id if layer_id else get_uuid() - self.layers[layer_dir][(layer_label, layer_id)] = layer + layer_id = get_uuid()[:4] + self.layers[layer_dir][(annotation_type, layer_id)] = layer + return layer_id def set_metadata(self, metadata: Dict[str, str]): if not self.metadata: diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index f254234..9183298 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -2,12 +2,13 @@ from collections import defaultdict from enum import Enum from pathlib import Path -from typing import Dict +from typing import Dict, Optional -from stam import AnnotationStore, Offset, Selector +from pydantic import BaseModel, ConfigDict, Field +from stam import AnnotationDataSet, AnnotationStore, Offset, Selector from openpecha.config import PECHA_ANNOTATION_STORE_ID, PECHA_DATASET_ID -from openpecha.ids import get_uuid +from openpecha.ids import get_fourchar_uuid, get_uuid from openpecha.pecha.annotation import Annotation @@ -27,14 +28,15 @@ def get_annotation_category(layer_type: LayerEnum) -> LayerGroupEnum: return LayerGroupEnum.structure_type -class Layer: - def __init__( - self, - annotation_type: LayerEnum, - annotations: Dict[str, Annotation] = defaultdict(), - ): - self.annotation_type = annotation_type - self.annotations = annotations +class Layer(BaseModel): + id_: str = Field(default_factory=get_fourchar_uuid) + annotation_type: LayerEnum + annotations: Dict[str, Annotation] = defaultdict() + + annotation_store: Optional[AnnotationStore] = None + dataset: Optional[AnnotationDataSet] = None + + model_config = ConfigDict(arbitrary_types_allowed=True) def set_annotation(self, annotation: Annotation): self.annotations[annotation.id_] = annotation @@ -48,10 +50,10 @@ def covert_to_relative_path(self, json_string: str, output_path: Path): return json_object def write(self, base_file_path: Path, output_path: Path): - self.base_file_path = base_file_path + base_file_path = base_file_path """write annotations in stam data model""" self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) - self.resource = self.annotation_store.add_resource( + resource = self.annotation_store.add_resource( id=base_file_path.name, filename=base_file_path.as_posix() ) self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) @@ -60,7 +62,7 @@ def write(self, base_file_path: Path, output_path: Path): unique_annotation_data_id = get_uuid() for annotation_id, annotation in self.annotations.items(): target = Selector.textselector( - self.resource, + resource, Offset.simple(annotation.start, annotation.end), ) data = [ @@ -81,9 +83,7 @@ def write(self, base_file_path: Path, output_path: Path): json_object = self.covert_to_relative_path(json_string, output_path) """ add four uuid digits to the layer file name for uniqueness""" layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem - layer_file_path = ( - layer_dir / f"{self.annotation_type.value}-{get_uuid()[:4]}.json" - ) + layer_file_path = layer_dir / f"{self.annotation_type.value}-{self.id_}.json" with open( layer_file_path, "w", diff --git a/tests/pecha/write/test_pecha.py b/tests/pecha/write/test_pecha.py index 2b4be16..ae65eaf 100644 --- a/tests/pecha/write/test_pecha.py +++ b/tests/pecha/write/test_pecha.py @@ -27,7 +27,11 @@ def get_base(): def get_layer(): return { "f2b056668a0c4ad3a085bdcd8e2d7adb": { - LayerEnum("Segment"): Layer(LayerEnum("Segment"), get_annotations()) + (LayerEnum.segment, "bf13"): Layer( + id_="bf13", + annotation_type=LayerEnum("Segment"), + annotations=get_annotations(), + ) } } @@ -57,10 +61,17 @@ def test_pecha_write(): pecha = Pecha(pecha_id=pecha_id, bases=base, layers=layer, metadata=get_metadata()) pecha.write(output_path=output_path) - output_file_names = [file.name for file in output_path.rglob("*")].sort() - expected_file_names = [file.name for file in expected_output_path.rglob("*")].sort() + output_file_names = [file.name for file in list(output_path.rglob("*"))] + expected_file_names = [file.name for file in list(expected_output_path.rglob("*"))] + + """ sort the list """ + output_file_names.sort() + expected_file_names.sort() assert output_file_names == expected_file_names """ clean up """ rmtree(output_path) + + +test_pecha_write() From 869d207f8b62883176187c956d1431bfd45a78e6 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 16:23:33 +0530 Subject: [PATCH 25/49] modify/ layer_dir -> base_name --- src/openpecha/pecha/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 9a1c523..9975839 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -37,13 +37,13 @@ def set_base_file(self, base_text: str) -> str: return base_file_name def set_layer( - self, layer_dir: str, annotation_type: LayerEnum, layer: Layer + self, base_name: str, annotation_type: LayerEnum, layer: Layer ) -> str: """layer key is a tuple of layer label and layer id""" """ A particular volume can have multiple layers with same label but different id""" layer_id = get_uuid()[:4] - self.layers[layer_dir][(annotation_type, layer_id)] = layer + self.layers[base_name][(annotation_type, layer_id)] = layer return layer_id def set_metadata(self, metadata: Dict[str, str]): From 3b6f1b8086ccf01ee272efc994d07e30406e9d5c Mon Sep 17 00:00:00 2001 From: Tenzin Date: Fri, 5 Jul 2024 16:26:33 +0530 Subject: [PATCH 26/49] modify layer_id -> layer_subtype_id --- src/openpecha/pecha/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 9975839..25f2f2a 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -42,9 +42,9 @@ def set_layer( """layer key is a tuple of layer label and layer id""" """ A particular volume can have multiple layers with same label but different id""" - layer_id = get_uuid()[:4] - self.layers[base_name][(annotation_type, layer_id)] = layer - return layer_id + layer_subtype_id = get_uuid()[:4] + self.layers[base_name][(annotation_type, layer_subtype_id)] = layer + return layer_subtype_id def set_metadata(self, metadata: Dict[str, str]): if not self.metadata: From 0f034d9528524111e8c8cce0518e5cd34038fb35 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 07:37:30 +0530 Subject: [PATCH 27/49] create PechaMetadata --- src/openpecha/pecha/__init__.py | 9 ++---- src/openpecha/pecha/metadata.py | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 src/openpecha/pecha/metadata.py diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 25f2f2a..a531e3a 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -6,6 +6,7 @@ from openpecha.config import PECHAS_PATH, _mkdir from openpecha.ids import get_uuid from openpecha.pecha.layer import Layer, LayerEnum +from openpecha.pecha.metadata import PechaMetadata class Pecha: @@ -16,7 +17,7 @@ def __init__( layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict( lambda: defaultdict() ), - metadata: Dict[str, str] = None, + metadata: PechaMetadata = None, ) -> None: self.pecha_id = pecha_id self.bases = bases @@ -46,12 +47,6 @@ def set_layer( self.layers[base_name][(annotation_type, layer_subtype_id)] = layer return layer_subtype_id - def set_metadata(self, metadata: Dict[str, str]): - if not self.metadata: - self.metadata = {} - for key, value in metadata.items(): - self.metadata[key] = value - def write(self, output_path: Path = PECHAS_PATH): pecha_dir = _mkdir(output_path / self.pecha_id) diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py new file mode 100644 index 0000000..d373cc3 --- /dev/null +++ b/src/openpecha/pecha/metadata.py @@ -0,0 +1,52 @@ +from collections import defaultdict +from datetime import datetime +from enum import Enum +from typing import Dict, List, Optional + +from pydantic import BaseModel, Field, field_validator + +from openpecha.ids import get_diplomatic_id, get_initial_pecha_id, get_open_pecha_id + + +class InitialCreationType(Enum): + ocr = "ocr" + ebook = "ebook" + input = "input" + tmx = "tmx" + + +class PechaMetadata(BaseModel): + id_: str = Field(default=None, alias="id_") + title: List[str] = Field(default=None, alias="title") + author: List[str] = Field(default=None, alias="author") + source: str = Field(default=None, alias="source") + language: str = Field(default=None, alias="language") + initial_creation_type: InitialCreationType = Field( + None, alias="initial_creation_type" + ) + created_at: datetime = Field(default=datetime.now, alias="created_at") + source_metadata: Optional[Dict] = Field( + default=defaultdict + ) # place to dump any metadata from the source + + @field_validator("created_at", pre=True, always=True) + def set_imported_date(cls, v): + return v or datetime.now() + + +class InitialPechaMetadata(PechaMetadata): + @field_validator("id_", pre=True, always=True) + def set_id(cls, v): + return v or get_initial_pecha_id() + + +class OpenPechaMetadata(PechaMetadata): + @field_validator("id_", pre=True, always=True) + def set_id(cls, v): + return v or get_open_pecha_id() + + +class DiplomaticPechaMetadata(PechaMetadata): + @field_validator("id_", pre=True, always=True) + def set_id(cls, v): + return v or get_diplomatic_id() From 3d188ecedd592c83dfee1814ae5910ef1602a384 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 07:50:23 +0530 Subject: [PATCH 28/49] instate Pecha with metadata --- src/openpecha/alignment/parsers/plaintext.py | 15 ++++++--------- src/openpecha/pecha/__init__.py | 4 ++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 80240f2..aefb594 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,10 +1,10 @@ from pathlib import Path from typing import List -from openpecha.ids import get_initial_pecha_id from openpecha.pecha import Pecha from openpecha.pecha.annotation import Annotation from openpecha.pecha.layer import Layer, LayerEnum +from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata class PlainTextLineAlignedParser: @@ -34,12 +34,12 @@ def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum): return layer def parse(self): - source_pecha_id, target_pecha_id = ( - get_initial_pecha_id(), - get_initial_pecha_id(), + source_pecha_metadata, target_pecha_metadata = ( + InitialPechaMetadata(initial_creation_type=InitialCreationType.input), + InitialPechaMetadata(initial_creation_type=InitialCreationType.input), ) - source_pecha = Pecha(source_pecha_id) - target_pecha = Pecha(target_pecha_id) + source_pecha = Pecha(metadata=source_pecha_metadata) + target_pecha = Pecha(metadata=target_pecha_metadata) source_base_name = source_pecha.set_base_file(self.source_text) target_base_name = target_pecha.set_base_file(self.target_text) @@ -55,9 +55,6 @@ def parse(self): self.create_pecha_layer(self.target_text.split("\n"), LayerEnum.segment), ) - source_pecha.set_metadata(self.metadata["source"]) - target_pecha.set_metadata(self.metadata["target"]) - return source_pecha, target_pecha # TODO: diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index a531e3a..d9a1c91 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -12,14 +12,14 @@ class Pecha: def __init__( self, - pecha_id: str, + pecha_id: str = None, bases: Dict[str, str] = defaultdict(), layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict( lambda: defaultdict() ), metadata: PechaMetadata = None, ) -> None: - self.pecha_id = pecha_id + self.pecha_id = metadata.pecha_id if metadata else pecha_id self.bases = bases self.layers = layers self.metadata = metadata From 15fee96c5b56c882745e75a7817296719b9e2e14 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 07:55:23 +0530 Subject: [PATCH 29/49] fix field_validator attribute --- src/openpecha/pecha/metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py index d373cc3..25ffaac 100644 --- a/src/openpecha/pecha/metadata.py +++ b/src/openpecha/pecha/metadata.py @@ -29,24 +29,24 @@ class PechaMetadata(BaseModel): default=defaultdict ) # place to dump any metadata from the source - @field_validator("created_at", pre=True, always=True) + @field_validator("created_at", mode="before") def set_imported_date(cls, v): return v or datetime.now() class InitialPechaMetadata(PechaMetadata): - @field_validator("id_", pre=True, always=True) + @field_validator("id_", mode="before") def set_id(cls, v): return v or get_initial_pecha_id() class OpenPechaMetadata(PechaMetadata): - @field_validator("id_", pre=True, always=True) + @field_validator("id_", mode="before") def set_id(cls, v): return v or get_open_pecha_id() class DiplomaticPechaMetadata(PechaMetadata): - @field_validator("id_", pre=True, always=True) + @field_validator("id_", mode="before") def set_id(cls, v): return v or get_diplomatic_id() From 753c721c3d126242b837672c7f7af132e095fabf Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 08:28:26 +0530 Subject: [PATCH 30/49] fix pydantic validator --- src/openpecha/pecha/metadata.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py index 25ffaac..41ab20c 100644 --- a/src/openpecha/pecha/metadata.py +++ b/src/openpecha/pecha/metadata.py @@ -3,9 +3,9 @@ from enum import Enum from typing import Dict, List, Optional -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator -from openpecha.ids import get_diplomatic_id, get_initial_pecha_id, get_open_pecha_id +from openpecha.ids import get_initial_pecha_id class InitialCreationType(Enum): @@ -35,18 +35,27 @@ def set_imported_date(cls, v): class InitialPechaMetadata(PechaMetadata): - @field_validator("id_", mode="before") - def set_id(cls, v): - return v or get_initial_pecha_id() + @model_validator(mode="before") + @classmethod + def set_id(cls, values): + if "id_" not in values or values["id_"] is None: + values["id_"] = get_initial_pecha_id() + return values class OpenPechaMetadata(PechaMetadata): - @field_validator("id_", mode="before") - def set_id(cls, v): - return v or get_open_pecha_id() + @model_validator(mode="before") + @classmethod + def set_id(cls, values): + if "id_" not in values or values["id_"] is None: + values["id_"] = get_initial_pecha_id() + return values class DiplomaticPechaMetadata(PechaMetadata): - @field_validator("id_", mode="before") - def set_id(cls, v): - return v or get_diplomatic_id() + @model_validator(mode="before") + @classmethod + def set_id(cls, values): + if "id_" not in values or values["id_"] is None: + values["id_"] = get_initial_pecha_id() + return values From 20cfefbf338772d95734a433e36a449c627e442f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 08:29:10 +0530 Subject: [PATCH 31/49] pass metadata to Pechadata --- src/openpecha/alignment/parsers/plaintext.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index aefb594..30c1671 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -35,8 +35,14 @@ def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum): def parse(self): source_pecha_metadata, target_pecha_metadata = ( - InitialPechaMetadata(initial_creation_type=InitialCreationType.input), - InitialPechaMetadata(initial_creation_type=InitialCreationType.input), + InitialPechaMetadata( + initial_creation_type=InitialCreationType.input, + source_metadata=self.metadata["source"], + ), + InitialPechaMetadata( + initial_creation_type=InitialCreationType.input, + source_metadata=self.metadata["target"], + ), ) source_pecha = Pecha(metadata=source_pecha_metadata) target_pecha = Pecha(metadata=target_pecha_metadata) From d7bdb302c0d1b4b6ccf0f76ebeafcb87a477f861 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 08:31:55 +0530 Subject: [PATCH 32/49] set pecha_id if not in metadata --- src/openpecha/pecha/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index d9a1c91..d8441ff 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -19,7 +19,7 @@ def __init__( ), metadata: PechaMetadata = None, ) -> None: - self.pecha_id = metadata.pecha_id if metadata else pecha_id + self.pecha_id = metadata.id_ if metadata else pecha_id self.bases = bases self.layers = layers self.metadata = metadata @@ -48,6 +48,8 @@ def set_layer( return layer_subtype_id def write(self, output_path: Path = PECHAS_PATH): + if not self.pecha_id: + raise ValueError("pecha_id must be set before writing.") pecha_dir = _mkdir(output_path / self.pecha_id) self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf") From 2d14de794d89fdae4f013273b538bda567bce279 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 09:11:54 +0530 Subject: [PATCH 33/49] make PechaMetadata json serializable --- src/openpecha/pecha/__init__.py | 7 +++++-- src/openpecha/pecha/metadata.py | 25 +++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index d8441ff..27afe0c 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -6,7 +6,7 @@ from openpecha.config import PECHAS_PATH, _mkdir from openpecha.ids import get_uuid from openpecha.pecha.layer import Layer, LayerEnum -from openpecha.pecha.metadata import PechaMetadata +from openpecha.pecha.metadata import PechaMetadata, to_json_serializable class Pecha: @@ -56,7 +56,10 @@ def write(self, output_path: Path = PECHAS_PATH): """ write metadata """ self.metadata_fn = self.base_path / "metadata.json" self.metadata_fn.write_text( - json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" + json.dumps( + to_json_serializable(self.metadata), indent=4, ensure_ascii=False + ), + encoding="utf-8", ) """ write base file""" diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py index 41ab20c..a7fa7f0 100644 --- a/src/openpecha/pecha/metadata.py +++ b/src/openpecha/pecha/metadata.py @@ -1,3 +1,4 @@ +import json from collections import defaultdict from datetime import datetime from enum import Enum @@ -24,15 +25,35 @@ class PechaMetadata(BaseModel): initial_creation_type: InitialCreationType = Field( None, alias="initial_creation_type" ) - created_at: datetime = Field(default=datetime.now, alias="created_at") + created_at: datetime = Field(default=None, alias="created_at") source_metadata: Optional[Dict] = Field( - default=defaultdict + default={} ) # place to dump any metadata from the source @field_validator("created_at", mode="before") def set_imported_date(cls, v): return v or datetime.now() + class Config: + json_encoders = { + InitialCreationType: lambda v: v.value, + defaultdict: lambda d: dict(d), + } + + +def to_json_serializable(pecha_metadata: Optional[PechaMetadata]) -> str: + if pecha_metadata is None: + return json.dumps({}, indent=4, ensure_ascii=False) + + # Convert the model to a dictionary + dict_data = pecha_metadata.model_dump() + # Convert the defaultdict to a regular dictionary + dict_data["source_metadata"] = dict(dict_data["source_metadata"]) + # Convert the initial_creation_type enum to its value + if dict_data["initial_creation_type"] is not None: + dict_data["initial_creation_type"] = dict_data["initial_creation_type"].value + return json.dumps(dict_data, indent=4, ensure_ascii=False) + class InitialPechaMetadata(PechaMetadata): @model_validator(mode="before") From 9f7a01a762135a644914bf6f021c7519387f1034 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 09:13:18 +0530 Subject: [PATCH 34/49] modify test_pecha instantiate pecha with metadata --- tests/pecha/write/test_pecha.py | 51 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/tests/pecha/write/test_pecha.py b/tests/pecha/write/test_pecha.py index ae65eaf..c905b80 100644 --- a/tests/pecha/write/test_pecha.py +++ b/tests/pecha/write/test_pecha.py @@ -1,9 +1,11 @@ from pathlib import Path from shutil import rmtree +from unittest import mock from openpecha.pecha import Pecha from openpecha.pecha.annotation import Annotation from openpecha.pecha.layer import Layer, LayerEnum +from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata def get_data_dir(): @@ -52,26 +54,35 @@ def get_annotations(): def test_pecha_write(): - pecha_id = "IE7D6875F" - base = get_base() - layer = get_layer() - output_path = get_data_dir() - expected_output_path = Path(__file__).parent / "expected_output" - - pecha = Pecha(pecha_id=pecha_id, bases=base, layers=layer, metadata=get_metadata()) - pecha.write(output_path=output_path) - - output_file_names = [file.name for file in list(output_path.rglob("*"))] - expected_file_names = [file.name for file in list(expected_output_path.rglob("*"))] - - """ sort the list """ - output_file_names.sort() - expected_file_names.sort() - - assert output_file_names == expected_file_names - - """ clean up """ - rmtree(output_path) + with mock.patch( + "openpecha.pecha.metadata.get_initial_pecha_id" + ) as mock_get_initial_pecha_id: + mock_get_initial_pecha_id.return_value = "IE7D6875F" + base = get_base() + layer = get_layer() + output_path = get_data_dir() + expected_output_path = Path(__file__).parent / "expected_output" + + metadata = InitialPechaMetadata(initial_creation_type=InitialCreationType.input) + pecha = Pecha(metadata=metadata) + pecha.bases = base + pecha.layers = layer + + pecha.write(output_path=output_path) + + output_file_names = [file.name for file in list(output_path.rglob("*"))] + expected_file_names = [ + file.name for file in list(expected_output_path.rglob("*")) + ] + + """ sort the list """ + output_file_names.sort() + expected_file_names.sort() + + assert output_file_names == expected_file_names + + """ clean up """ + rmtree(output_path) test_pecha_write() From 7c253d2e73b48748c97c904638e3b800bbfaf332 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 10:12:55 +0530 Subject: [PATCH 35/49] set base and layer from class method from_path Pecha --- src/openpecha/pecha/__init__.py | 29 +++++++++++++++------------ src/openpecha/pecha/layer.py | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 99a3137..1a21d40 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -13,7 +13,7 @@ class Pecha: def __init__( self, pecha_id: str = None, - bases: Dict[str, str] = defaultdict(), + bases: Dict[str, str] = defaultdict(str), layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict( lambda: defaultdict() ), @@ -33,15 +33,14 @@ def from_path(cls, base_path: Path): # metadata = json.load(f) # pecha.set_metadata(metadata) - # for base_file in (base_path / "base").rglob("*.txt"): - # base_text = base_file.read_text(encoding="utf-8") - # pecha.set_base_file(base_file.stem, base_text) + for base_file in (base_path / "base").rglob("*"): + base_text = base_file.read_text(encoding="utf-8") + pecha.set_base_file(base_text, base_file.stem) - # for layer_dir in (base_path / "layers").iterdir(): - # for layer_file in layer_dir.glob("*.json"): - # layer = Layer.from_path(layer_file) - # layer_key = (layer.annotation_label, layer_file.stem) - # pecha.set_layer(layer_dir.stem, layer_key, layer) + for layer_dir in (base_path / "layers").iterdir(): + for layer_file in layer_dir.glob("*.json"): + layer = Layer.from_path(layer_file) + pecha.set_layer(layer_dir.stem, layer.annotation_type, layer, layer.id_) return pecha @@ -49,18 +48,22 @@ def from_path(cls, base_path: Path): def from_id(cls, pecha_id: str): pass - def set_base_file(self, base_text: str) -> str: - base_file_name = get_uuid() + def set_base_file(self, base_text: str, base_file_name: str = None) -> str: + base_file_name = base_file_name if base_file_name else get_uuid()[:4] self.bases[base_file_name] = base_text return base_file_name def set_layer( - self, base_name: str, annotation_type: LayerEnum, layer: Layer + self, + base_name: str, + annotation_type: LayerEnum, + layer: Layer, + layer_subtype_id: str = None, ) -> str: """layer key is a tuple of layer label and layer id""" """ A particular volume can have multiple layers with same label but different id""" - layer_subtype_id = get_uuid()[:4] + layer_subtype_id = get_uuid()[:4] if not layer_subtype_id else layer_subtype_id self.layers[base_name][(annotation_type, layer_subtype_id)] = layer return layer_subtype_id diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 9183298..dc3d060 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -28,6 +28,14 @@ def get_annotation_category(layer_type: LayerEnum) -> LayerGroupEnum: return LayerGroupEnum.structure_type +def convert_relative_to_absolute_path(json_data, absolute_base_path: Path): + """call after loading the stam from json""" + for resource in json_data["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(absolute_base_path / original_path) + return json_data + + class Layer(BaseModel): id_: str = Field(default_factory=get_fourchar_uuid) annotation_type: LayerEnum @@ -38,6 +46,33 @@ class Layer(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) + @classmethod + def from_path(cls, layer_file_path: Path): + """get annotation label""" + annotation_label = LayerEnum(layer_file_path.stem.split("-")[0]) + layer_id = layer_file_path.stem.split("-")[1] + """ load annotations from json""" + with open(layer_file_path) as f: + json_data = json.load(f) + absolute_base_path = layer_file_path.parents[4] + json_data = convert_relative_to_absolute_path(json_data, absolute_base_path) + annotation_store = AnnotationStore(string=json.dumps(json_data)) + + layer_annotations: Dict[str, Annotation] = {} + for annotation in annotation_store.annotations(): + annotation_id, segment = annotation.id(), str(annotation) + start = annotation.offset().begin().value() + end = annotation.offset().end().value() + layer_annotations[annotation_id] = Annotation( + segment=segment, start=start, end=end + ) + + return Layer( + id_=layer_id, + annotation_type=annotation_label, + annotations=layer_annotations, + ) + def set_annotation(self, annotation: Annotation): self.annotations[annotation.id_] = annotation From ba5e4d0b6042ea752712ce463352869da429ab8f Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 10:14:42 +0530 Subject: [PATCH 36/49] refactor code --- src/openpecha/pecha/layer.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index dc3d060..ac36c90 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -28,14 +28,6 @@ def get_annotation_category(layer_type: LayerEnum) -> LayerGroupEnum: return LayerGroupEnum.structure_type -def convert_relative_to_absolute_path(json_data, absolute_base_path: Path): - """call after loading the stam from json""" - for resource in json_data["resources"]: - original_path = Path(resource["@include"]) - resource["@include"] = str(absolute_base_path / original_path) - return json_data - - class Layer(BaseModel): id_: str = Field(default_factory=get_fourchar_uuid) annotation_type: LayerEnum @@ -76,14 +68,6 @@ def from_path(cls, layer_file_path: Path): def set_annotation(self, annotation: Annotation): self.annotations[annotation.id_] = annotation - def covert_to_relative_path(self, json_string: str, output_path: Path): - """convert the absolute path to relative path for base file path in json string""" - json_object = json.loads(json_string) - for resource in json_object["resources"]: - original_path = Path(resource["@include"]) - resource["@include"] = str(original_path.relative_to(output_path)) - return json_object - def write(self, base_file_path: Path, output_path: Path): base_file_path = base_file_path """write annotations in stam data model""" @@ -115,7 +99,7 @@ def write(self, base_file_path: Path, output_path: Path): ) """ save annotations in json""" json_string = self.annotation_store.to_json_string() - json_object = self.covert_to_relative_path(json_string, output_path) + json_object = convert_to_relative_path(json_string, output_path) """ add four uuid digits to the layer file name for uniqueness""" layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem layer_file_path = layer_dir / f"{self.annotation_type.value}-{self.id_}.json" @@ -124,3 +108,20 @@ def write(self, base_file_path: Path, output_path: Path): "w", ) as f: f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) + + +def convert_relative_to_absolute_path(json_data, absolute_base_path: Path): + """call after loading the stam from json""" + for resource in json_data["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(absolute_base_path / original_path) + return json_data + + +def convert_to_relative_path(json_string: str, output_path: Path): + """convert the absolute path to relative path for base file path in json string""" + json_object = json.loads(json_string) + for resource in json_object["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(original_path.relative_to(output_path)) + return json_object From 029e6add8383a5c1e6de983dff3ecfd936311545 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 10:28:32 +0530 Subject: [PATCH 37/49] create test_pecha_read --- .../base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt | 1 + .../Segment-bf13.json | 88 +++++++++++++++++++ .../IE7D6875F/IE7D6875F.opf/metadata.json | 3 + tests/pecha/read/test_pecha_read.py | 24 +++++ 4 files changed, 116 insertions(+) create mode 100644 tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt create mode 100644 tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json create mode 100644 tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json create mode 100644 tests/pecha/read/test_pecha_read.py diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt new file mode 100644 index 0000000..0b166fc --- /dev/null +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt @@ -0,0 +1 @@ +རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ། \ No newline at end of file diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json new file mode 100644 index 0000000..92bcaec --- /dev/null +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json @@ -0,0 +1,88 @@ +{ + "@type": "AnnotationStore", + "@id": "PechaAnnotationStore", + "resources": [ + { + "@type": "TextResource", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "@include": "IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt" + } + ], + "annotationsets": [ + { + "@type": "AnnotationDataSet", + "@id": "PechaDataSet", + "keys": [ + { + "@type": "DataKey", + "@id": "Structure Type" + } + ], + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "key": "Structure Type", + "value": { + "@type": "String", + "value": "Segment" + } + } + ] + } + ], + "annotations": [ + { + "@type": "Annotation", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 0 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 39 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + }, + { + "@type": "Annotation", + "@id": "b696df2dbe314e8a87881a2bc391d0d5", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 39 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 103 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json new file mode 100644 index 0000000..cb740ab --- /dev/null +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json @@ -0,0 +1,3 @@ +{ + "annotation_label": "Segment" +} \ No newline at end of file diff --git a/tests/pecha/read/test_pecha_read.py b/tests/pecha/read/test_pecha_read.py new file mode 100644 index 0000000..c2f3ff2 --- /dev/null +++ b/tests/pecha/read/test_pecha_read.py @@ -0,0 +1,24 @@ +from pathlib import Path + +from openpecha.pecha import Pecha +from openpecha.pecha.layer import Layer, LayerEnum + + +def test_pecha_read(): + DATA = Path(__file__).parent / "data" + pecha = Pecha.from_path(DATA / "IE7D6875F" / "IE7D6875F.opf") + assert pecha.pecha_id == "IE7D6875F" + assert "f2b056668a0c4ad3a085bdcd8e2d7adb" in pecha.bases + assert ( + pecha.bases["f2b056668a0c4ad3a085bdcd8e2d7adb"] + == "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།" + ) + + for layer_key, layer in pecha.layers["f2b056668a0c4ad3a085bdcd8e2d7adb"].items(): + annotation_type, layer_id = layer_key + assert annotation_type == LayerEnum.segment + assert isinstance(layer_id, str) + assert isinstance(layer, Layer) + + +test_pecha_read() From 72ef523dfe35337fb7dbf7472d776cd6628a92b6 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 10:32:38 +0530 Subject: [PATCH 38/49] fix/set annotation id in layer classmethod from path --- src/openpecha/pecha/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index ac36c90..4a6ad9f 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -56,7 +56,7 @@ def from_path(cls, layer_file_path: Path): start = annotation.offset().begin().value() end = annotation.offset().end().value() layer_annotations[annotation_id] = Annotation( - segment=segment, start=start, end=end + id_=annotation_id, segment=segment, start=start, end=end ) return Layer( From 2f289666ca739456ec34d25c0de578e4c94dbaa7 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 12:04:05 +0530 Subject: [PATCH 39/49] Layer get_annotations --- src/openpecha/pecha/layer.py | 21 +++++++++++++++++++++ tests/pecha/read/test_pecha_read.py | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 4a6ad9f..efc144f 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -5,6 +5,7 @@ from typing import Dict, Optional from pydantic import BaseModel, ConfigDict, Field +from stam import Annotation as StamAnnotation from stam import AnnotationDataSet, AnnotationStore, Offset, Selector from openpecha.config import PECHA_ANNOTATION_STORE_ID, PECHA_DATASET_ID @@ -63,8 +64,28 @@ def from_path(cls, layer_file_path: Path): id_=layer_id, annotation_type=annotation_label, annotations=layer_annotations, + annotation_store=annotation_store, ) + def get_annotations(self): + if not self.annotation_store: + return None + for ann in self.annotation_store: + yield self.parse_annotation(ann) + + def get_annotation(self, ann_id: str): + if not self.annotation_store: + return None + ann = self.annotation_store.annotation(id=ann_id) + return self.parse_annotation(ann) + + def parse_annotation(self, ann: StamAnnotation): + ann_id = ann.id() + ann_segment = str(ann) + start = ann.offset().begin().value() + end = ann.offset().end().value() + return {"id": ann_id, "segment": ann_segment, "start": start, "end": end} + def set_annotation(self, annotation: Annotation): self.annotations[annotation.id_] = annotation diff --git a/tests/pecha/read/test_pecha_read.py b/tests/pecha/read/test_pecha_read.py index c2f3ff2..97b5e03 100644 --- a/tests/pecha/read/test_pecha_read.py +++ b/tests/pecha/read/test_pecha_read.py @@ -20,5 +20,25 @@ def test_pecha_read(): assert isinstance(layer_id, str) assert isinstance(layer, Layer) + first_layer = pecha.layers["f2b056668a0c4ad3a085bdcd8e2d7adb"][ + (LayerEnum.segment, "bf13") + ] + + annotations = list(first_layer.get_annotations()) + assert annotations == [ + { + "id": "f2b056668a0c4ad3a085bdcd8e2d7adb", + "segment": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", + "start": 0, + "end": 39, + }, + { + "id": "b696df2dbe314e8a87881a2bc391d0d5", + "segment": "བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", + "start": 39, + "end": 103, + }, + ] + test_pecha_read() From 783927457a81424f38ebc67b9325a0a2ef6c9526 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 14:47:38 +0530 Subject: [PATCH 40/49] add/ annotation metadata in parse_annotation --- src/openpecha/pecha/layer.py | 14 +++++++++++++- tests/pecha/read/test_pecha_read.py | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index efc144f..1a4aea6 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -84,7 +84,19 @@ def parse_annotation(self, ann: StamAnnotation): ann_segment = str(ann) start = ann.offset().begin().value() end = ann.offset().end().value() - return {"id": ann_id, "segment": ann_segment, "start": start, "end": end} + + parsed_ann = {"id": ann_id, "segment": ann_segment, "start": start, "end": end} + + for ann_data in ann: + key, value = ann_data.key().id(), str(ann_data.value()) + if key in LayerGroupEnum._value2member_map_: + parsed_ann["annotation_category"] = key + parsed_ann["annotation_type"] = value + else: + parsed_ann["payloads"] = defaultdict(str) + parsed_ann["payloads"][key] = value + + return parsed_ann def set_annotation(self, annotation: Annotation): self.annotations[annotation.id_] = annotation diff --git a/tests/pecha/read/test_pecha_read.py b/tests/pecha/read/test_pecha_read.py index 97b5e03..fe80e9c 100644 --- a/tests/pecha/read/test_pecha_read.py +++ b/tests/pecha/read/test_pecha_read.py @@ -31,12 +31,16 @@ def test_pecha_read(): "segment": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", "start": 0, "end": 39, + "annotation_category": "Structure Type", + "annotation_type": "Segment", }, { "id": "b696df2dbe314e8a87881a2bc391d0d5", "segment": "བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", "start": 39, "end": 103, + "annotation_category": "Structure Type", + "annotation_type": "Segment", }, ] From fba97e27a636da71f18f0e3a9d452d393aeb2cbb Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 15:02:33 +0530 Subject: [PATCH 41/49] write ann metadata to stam if exist --- src/openpecha/pecha/layer.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 1a4aea6..b1ed743 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -2,7 +2,7 @@ from collections import defaultdict from enum import Enum from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, Tuple from pydantic import BaseModel, ConfigDict, Field from stam import Annotation as StamAnnotation @@ -111,20 +111,45 @@ def write(self, base_file_path: Path, output_path: Path): self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) annotation_category = get_annotation_category(self.annotation_type).value self.dataset.add_key(annotation_category) - unique_annotation_data_id = get_uuid() + + unique_ann_data_id = get_uuid() + ann_data_ids: Dict[Tuple[str, str], str] = {} + for annotation_id, annotation in self.annotations.items(): target = Selector.textselector( resource, Offset.simple(annotation.start, annotation.end), ) + data = [ { - "id": unique_annotation_data_id, + "id": unique_ann_data_id, "key": annotation_category, "value": self.annotation_type.value, "set": self.dataset.id(), } ] + """ + add metadata to the annotation if exists + if the metadata is already added, get the id from the dictionary, + else create a new id and add to the dictionary + """ + if annotation.metadata: + for key, value in annotation.metadata.items(): + if (key, value) in ann_data_ids: + ann_data_id = ann_data_ids[(key, value)] + else: + ann_data_id = get_uuid() + ann_data_ids[(key, value)] = ann_data_id + data.append( + { + "id": ann_data_id, + "key": key, + "value": value, + "set": self.dataset.id(), + } + ) + self.annotation_store.annotate( id=annotation_id, target=target, From 2ca71a5c3bccbb0f07e8a4748b9335595c2b76a7 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 8 Jul 2024 16:20:06 +0530 Subject: [PATCH 42/49] read pecha metadata from_path --- src/openpecha/pecha/__init__.py | 43 ++++++++++++++++--- src/openpecha/pecha/metadata.py | 15 ++++--- .../IE7D6875F/IE7D6875F.opf/metadata.json | 4 +- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 1a21d40..09f0deb 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -6,7 +6,11 @@ from openpecha.config import PECHAS_PATH, _mkdir from openpecha.ids import get_uuid from openpecha.pecha.layer import Layer, LayerEnum -from openpecha.pecha.metadata import PechaMetadata, to_json_serializable +from openpecha.pecha.metadata import ( + InitialCreationType, + PechaMetadata, + to_json_serializable, +) class Pecha: @@ -26,12 +30,14 @@ def __init__( @classmethod def from_path(cls, base_path: Path): - pecha_id = base_path.stem - pecha = Pecha(pecha_id=pecha_id) - # with open(base_path / "metadata.json", encoding="utf-8") as f: - # metadata = json.load(f) - # pecha.set_metadata(metadata) + with open(base_path / "metadata.json", encoding="utf-8") as f: + metadata = json.load(f) + metadata = json.loads(metadata) + + preprocessed_meta = preprocess_metadata(metadata) + pecha_metadata = PechaMetadata(**preprocessed_meta) + pecha = Pecha(metadata=pecha_metadata) for base_file in (base_path / "base").rglob("*"): base_text = base_file.read_text(encoding="utf-8") @@ -98,3 +104,28 @@ def write(self, output_path: Path = PECHAS_PATH): base_file_path=base_dir / f"{layer_name}.txt", output_path=output_path, ) + + +def preprocess_metadata(metadata: Dict) -> Dict: + # Replace null values with default values + processed_metadata = { + "id_": metadata.get("id_", ""), + "title": metadata.get("title", []) if metadata.get("title") is not None else [], + "author": metadata.get("author", []) + if metadata.get("author") is not None + else [], + "source": metadata.get("source", "") + if metadata.get("source") is not None + else "", + "language": metadata.get("language", "") + if metadata.get("language") is not None + else "", + "initial_creation_type": InitialCreationType(metadata["initial_creation_type"]) + if "initial_creation_type" in metadata + else None, + "created_at": metadata.get("created_at"), + "source_metadata": metadata.get("source_metadata", {}) + if metadata.get("source_metadata") is not None + else {}, + } + return processed_metadata diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py index a7fa7f0..a7d6840 100644 --- a/src/openpecha/pecha/metadata.py +++ b/src/openpecha/pecha/metadata.py @@ -18,16 +18,16 @@ class InitialCreationType(Enum): class PechaMetadata(BaseModel): id_: str = Field(default=None, alias="id_") - title: List[str] = Field(default=None, alias="title") - author: List[str] = Field(default=None, alias="author") + title: List[str] = Field(default=list, alias="title") + author: List[str] = Field(default=list, alias="author") source: str = Field(default=None, alias="source") language: str = Field(default=None, alias="language") initial_creation_type: InitialCreationType = Field( None, alias="initial_creation_type" ) - created_at: datetime = Field(default=None, alias="created_at") + created_at: Optional[datetime] = Field(default=None, alias="created_at") source_metadata: Optional[Dict] = Field( - default={} + default=dict ) # place to dump any metadata from the source @field_validator("created_at", mode="before") @@ -48,10 +48,15 @@ def to_json_serializable(pecha_metadata: Optional[PechaMetadata]) -> str: # Convert the model to a dictionary dict_data = pecha_metadata.model_dump() # Convert the defaultdict to a regular dictionary - dict_data["source_metadata"] = dict(dict_data["source_metadata"]) # Convert the initial_creation_type enum to its value if dict_data["initial_creation_type"] is not None: dict_data["initial_creation_type"] = dict_data["initial_creation_type"].value + for k, v in dict_data.items(): + if v is list: + dict_data[k] = [] + continue + if v is dict: + dict_data[k] = {} return json.dumps(dict_data, indent=4, ensure_ascii=False) diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json index cb740ab..38be7bc 100644 --- a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json @@ -1,3 +1 @@ -{ - "annotation_label": "Segment" -} \ No newline at end of file +"{\n \"id_\": \"IE7D6875F\",\n \"title\": null,\n \"author\": null,\n \"source\": null,\n \"language\": null,\n \"initial_creation_type\": \"input\",\n \"created_at\": null,\n \"source_metadata\": {}\n}" \ No newline at end of file From 3a2e5af84fd2a7215312752d4d7395e9bccec644 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 08:28:08 +0530 Subject: [PATCH 43/49] modify path assignment --- src/openpecha/config.py | 2 ++ src/openpecha/pecha/__init__.py | 11 +++++++---- tests/pecha/read/test_pecha_read.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/openpecha/config.py b/src/openpecha/config.py index e0fa952..8d34098 100644 --- a/src/openpecha/config.py +++ b/src/openpecha/config.py @@ -9,6 +9,8 @@ def _mkdir(path): return path +ORG_NAME = "PechaData" + BASE_PATH = _mkdir(Path.home() / ".pechadata") PECHAS_PATH = _mkdir(BASE_PATH / "pechas") diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 09f0deb..4662a93 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -29,8 +29,9 @@ def __init__( self.metadata = metadata @classmethod - def from_path(cls, base_path: Path): - + def from_path(cls, pecha_path: Path): + pecha_id = pecha_path.stem + base_path = pecha_path / f"{pecha_id}.opf" with open(base_path / "metadata.json", encoding="utf-8") as f: metadata = json.load(f) metadata = json.loads(metadata) @@ -38,6 +39,7 @@ def from_path(cls, base_path: Path): preprocessed_meta = preprocess_metadata(metadata) pecha_metadata = PechaMetadata(**preprocessed_meta) pecha = Pecha(metadata=pecha_metadata) + pecha.pecha_path = pecha_path for base_file in (base_path / "base").rglob("*"): base_text = base_file.read_text(encoding="utf-8") @@ -77,8 +79,9 @@ def write(self, output_path: Path = PECHAS_PATH): if not self.pecha_id: raise ValueError("pecha_id must be set before writing.") - pecha_dir = _mkdir(output_path / self.pecha_id) - self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf") + self.pecha_path = _mkdir(output_path / self.pecha_id) + + self.base_path = _mkdir(self.pecha_path / f"{self.pecha_id}.opf") """ write metadata """ self.metadata_fn = self.base_path / "metadata.json" self.metadata_fn.write_text( diff --git a/tests/pecha/read/test_pecha_read.py b/tests/pecha/read/test_pecha_read.py index fe80e9c..fbd4f9a 100644 --- a/tests/pecha/read/test_pecha_read.py +++ b/tests/pecha/read/test_pecha_read.py @@ -6,7 +6,7 @@ def test_pecha_read(): DATA = Path(__file__).parent / "data" - pecha = Pecha.from_path(DATA / "IE7D6875F" / "IE7D6875F.opf") + pecha = Pecha.from_path(DATA / "IE7D6875F") assert pecha.pecha_id == "IE7D6875F" assert "f2b056668a0c4ad3a085bdcd8e2d7adb" in pecha.bases assert ( From cfc24147ddcc644aa696b72629a4696ae8fe3005 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 08:41:54 +0530 Subject: [PATCH 44/49] upload files to github repo --- pyproject.toml | 2 +- src/openpecha/github_utils.py | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 src/openpecha/github_utils.py diff --git a/pyproject.toml b/pyproject.toml index b0a336a..405713f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "pydantic >= 2.7.4", "stam == 0.8.2", "collection >= 0.1.6", - + "PyGithub >= 2.3.0", ] [project.optional-dependencies] diff --git a/src/openpecha/github_utils.py b/src/openpecha/github_utils.py new file mode 100644 index 0000000..4237cda --- /dev/null +++ b/src/openpecha/github_utils.py @@ -0,0 +1,37 @@ +import os +from pathlib import Path + +from github import Github, GithubException + +from openpecha.config import ORG_NAME + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +if not GITHUB_TOKEN: + raise Exception("GITHUB_TOKEN is not set in the environment.") + + +def create_github_repo(repo_name: str): + try: + g = Github(GITHUB_TOKEN) + org = g.get_organization(ORG_NAME) + org.create_repo(repo_name) + + except GithubException as e: + raise GithubException(f"Error creating repo: {e}") + + +def upload_files_to_github_repo(repo_name: str, folder_path: Path): + try: + g = Github(GITHUB_TOKEN) + org = g.get_organization(ORG_NAME) + repo = org.get_repo(repo_name) + + for file in folder_path.rglob("*"): + if file.is_dir(): + continue + file_path = file.relative_to(folder_path) + with open(file) as f: + content = f.read() + repo.create_file(str(file_path), f"committing {file.name}", content) + except GithubException as e: + raise GithubException(f"Error uploading files to github: {e}") From a03050caeb6836b799dd050730c0647fd1c56a32 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 08:47:23 +0530 Subject: [PATCH 45/49] delete unneccessary lines --- src/openpecha/pecha/metadata.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py index a7d6840..8220539 100644 --- a/src/openpecha/pecha/metadata.py +++ b/src/openpecha/pecha/metadata.py @@ -45,10 +45,7 @@ def to_json_serializable(pecha_metadata: Optional[PechaMetadata]) -> str: if pecha_metadata is None: return json.dumps({}, indent=4, ensure_ascii=False) - # Convert the model to a dictionary dict_data = pecha_metadata.model_dump() - # Convert the defaultdict to a regular dictionary - # Convert the initial_creation_type enum to its value if dict_data["initial_creation_type"] is not None: dict_data["initial_creation_type"] = dict_data["initial_creation_type"].value for k, v in dict_data.items(): From 32e10e60646ec38f1b97dc93b166522968ec63a0 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 09:03:16 +0530 Subject: [PATCH 46/49] Pecha classmethod from_id --- src/openpecha/github_utils.py | 22 ++++++++++++++++++++++ src/openpecha/pecha/__init__.py | 4 +++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/openpecha/github_utils.py b/src/openpecha/github_utils.py index 4237cda..7a15d28 100644 --- a/src/openpecha/github_utils.py +++ b/src/openpecha/github_utils.py @@ -1,5 +1,7 @@ import os +import subprocess from pathlib import Path +from shutil import rmtree from github import Github, GithubException @@ -35,3 +37,23 @@ def upload_files_to_github_repo(repo_name: str, folder_path: Path): repo.create_file(str(file_path), f"committing {file.name}", content) except GithubException as e: raise GithubException(f"Error uploading files to github: {e}") + + +def clone_github_repo(repo_name: str, destination_folder: Path): + repo_path = destination_folder / repo_name + if repo_path.exists(): + rmtree(repo_path) + else: + try: + repo_url = f"https://github.com/{ORG_NAME}/{repo_name}.git" + env = {"GIT_ASKPASS": "echo", "GIT_PASSWORD": GITHUB_TOKEN} + subprocess.run( + ["git", "clone", repo_url, str(repo_path)], + check=True, + capture_output=True, + env={k: str(v) for k, v in env.items()}, + ) + return repo_path + except subprocess.CalledProcessError as e: + print(f"Error cloning {repo_name} repository: {e}") + return None diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 4662a93..7bfff2c 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -4,6 +4,7 @@ from typing import Dict, Optional, Tuple from openpecha.config import PECHAS_PATH, _mkdir +from openpecha.github_utils import clone_github_repo from openpecha.ids import get_uuid from openpecha.pecha.layer import Layer, LayerEnum from openpecha.pecha.metadata import ( @@ -54,7 +55,8 @@ def from_path(cls, pecha_path: Path): @classmethod def from_id(cls, pecha_id: str): - pass + repo_path = clone_github_repo(pecha_id, PECHAS_PATH) + return cls.from_path(repo_path) def set_base_file(self, base_text: str, base_file_name: str = None) -> str: base_file_name = base_file_name if base_file_name else get_uuid()[:4] From f13cc29764a9eeca38efff3de9b5ea1e5a66d9b6 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 09:15:59 +0530 Subject: [PATCH 47/49] setup a dummy GITHUB_TOKEN --- .github/workflows/CI.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index bdd06bd..bad05e6 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -16,20 +16,25 @@ jobs: steps: - uses: actions/checkout@v3 - + - name: Set up Python 3.8 uses: actions/setup-python@v3 with: python-version: "3.8" - + - name: Install dependencies run: | pip install -U pip pip install . pip install .[dev] - + + - name: Set up GITHUB_TOKEN + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: echo "GITHUB_TOKEN is set up" + - name: Test with pytest run: PYTHONPATH=src pytest - + - name: Test Coverage run: PYTHONPATH=src pytest --cov project_name From 961cef2f11ad77c728ec368be6d16b0464bb2cf5 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 09:50:08 +0530 Subject: [PATCH 48/49] update CI --- .github/workflows/CI.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index bad05e6..830631e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -28,13 +28,12 @@ jobs: pip install . pip install .[dev] - - name: Set up GITHUB_TOKEN - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: echo "GITHUB_TOKEN is set up" - - name: Test with pytest run: PYTHONPATH=src pytest + env: + GITHUB_TOKEN: "" - name: Test Coverage - run: PYTHONPATH=src pytest --cov project_name + run: PYTHONPATH=src pytest --cov openpecha + env: + GITHUB_TOKEN: "" From 154514a3d618fc13e102e8c38e7d1133ce2c44df Mon Sep 17 00:00:00 2001 From: Tenzin Date: Wed, 10 Jul 2024 09:58:29 +0530 Subject: [PATCH 49/49] update CI --- .github/workflows/CI.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 830631e..288c688 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,9 +31,9 @@ jobs: - name: Test with pytest run: PYTHONPATH=src pytest env: - GITHUB_TOKEN: "" + GITHUB_TOKEN: " " - name: Test Coverage run: PYTHONPATH=src pytest --cov openpecha env: - GITHUB_TOKEN: "" + GITHUB_TOKEN: " "