diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index bdd06bd..288c688 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -16,20 +16,24 @@ jobs: steps: - uses: actions/checkout@v3 - + - name: Set up Python 3.8 uses: actions/setup-python@v3 with: python-version: "3.8" - + - name: Install dependencies run: | pip install -U pip pip install . pip install .[dev] - + - name: Test with pytest run: PYTHONPATH=src pytest - + env: + GITHUB_TOKEN: " " + - name: Test Coverage - run: PYTHONPATH=src pytest --cov project_name + run: PYTHONPATH=src pytest --cov openpecha + env: + GITHUB_TOKEN: " " diff --git a/pyproject.toml b/pyproject.toml index 8f147a9..405713f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,8 @@ classifiers = [ dependencies = [ "pydantic >= 2.7.4", "stam == 0.8.2", - + "collection >= 0.1.6", + "PyGithub >= 2.3.0", ] [project.optional-dependencies] diff --git a/src/openpecha/alignment/parsers/plaintext.py b/src/openpecha/alignment/parsers/plaintext.py index 92a1dfd..30c1671 100644 --- a/src/openpecha/alignment/parsers/plaintext.py +++ b/src/openpecha/alignment/parsers/plaintext.py @@ -1,10 +1,10 @@ from pathlib import Path -from typing import Dict +from typing import List -from openpecha.ids import get_initial_pecha_id, get_uuid from openpecha.pecha import Pecha from openpecha.pecha.annotation import Annotation from openpecha.pecha.layer import Layer, LayerEnum +from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata class PlainTextLineAlignedParser: @@ -19,55 +19,48 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict): target_text = target_path.read_text(encoding="utf-8") return cls(source_text, target_text, metadata) - def create_pecha_layer(self, base_text: str, annotation: LayerEnum): + def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum): """ """ - layer_annotations: Dict[str, Annotation] = {} + layer = 
Layer(annotation_type=annotation_type) char_count = 0 - for segment in base_text.split("\n"): - layer_annotations[get_uuid()] = Annotation( - id_=get_uuid(), - segment=segment, + for segment in segments: + annotation = Annotation( start=char_count, end=char_count + len(segment), ) + layer.set_annotation(annotation) char_count += len(segment) - return Layer(annotation_label=annotation, annotations=layer_annotations) + return layer def parse(self): - source_pecha_id, target_pecha_id = ( - get_initial_pecha_id(), - get_initial_pecha_id(), + source_pecha_metadata, target_pecha_metadata = ( + InitialPechaMetadata( + initial_creation_type=InitialCreationType.input, + source_metadata=self.metadata["source"], + ), + InitialPechaMetadata( + initial_creation_type=InitialCreationType.input, + source_metadata=self.metadata["target"], + ), ) + source_pecha = Pecha(metadata=source_pecha_metadata) + target_pecha = Pecha(metadata=target_pecha_metadata) - source_base_fname, target_base_fname = get_uuid(), get_uuid() - source_base_files = {source_base_fname: self.source_text} - target_base_files = {target_base_fname: self.target_text} + source_base_name = source_pecha.set_base_file(self.source_text) + target_base_name = target_pecha.set_base_file(self.target_text) - source_annotation = LayerEnum(self.metadata["source"]["annotation_label"]) - target_annotation = LayerEnum(self.metadata["target"]["annotation_label"]) - - source_layers = { - source_base_fname: { - source_annotation: self.create_pecha_layer( - self.source_text, source_annotation - ) - } - } - target_layers = { - target_base_fname: { - target_annotation: self.create_pecha_layer( - self.target_text, target_annotation - ), - } - } - - source_pecha = Pecha( # noqa - source_pecha_id, source_base_files, source_layers, self.metadata["source"] + source_pecha.set_layer( + source_base_name, + LayerEnum.segment, + self.create_pecha_layer(self.source_text.split("\n"), LayerEnum.segment), ) - target_pecha = Pecha( # noqa - 
target_pecha_id, target_base_files, target_layers, self.metadata["target"] + target_pecha.set_layer( + target_base_name, + LayerEnum.segment, + self.create_pecha_layer(self.target_text.split("\n"), LayerEnum.segment), ) + return source_pecha, target_pecha # TODO: diff --git a/src/openpecha/config.py b/src/openpecha/config.py index e0fa952..8d34098 100644 --- a/src/openpecha/config.py +++ b/src/openpecha/config.py @@ -9,6 +9,8 @@ def _mkdir(path): return path +ORG_NAME = "PechaData" + BASE_PATH = _mkdir(Path.home() / ".pechadata") PECHAS_PATH = _mkdir(BASE_PATH / "pechas") diff --git a/src/openpecha/github_utils.py b/src/openpecha/github_utils.py new file mode 100644 index 0000000..7a15d28 --- /dev/null +++ b/src/openpecha/github_utils.py @@ -0,0 +1,59 @@ +import os +import subprocess +from pathlib import Path +from shutil import rmtree + +from github import Github, GithubException + +from openpecha.config import ORG_NAME + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +if not GITHUB_TOKEN: + raise Exception("GITHUB_TOKEN is not set in the environment.") + + +def create_github_repo(repo_name: str): + try: + g = Github(GITHUB_TOKEN) + org = g.get_organization(ORG_NAME) + org.create_repo(repo_name) + + except GithubException as e: + raise GithubException(f"Error creating repo: {e}") + + +def upload_files_to_github_repo(repo_name: str, folder_path: Path): + try: + g = Github(GITHUB_TOKEN) + org = g.get_organization(ORG_NAME) + repo = org.get_repo(repo_name) + + for file in folder_path.rglob("*"): + if file.is_dir(): + continue + file_path = file.relative_to(folder_path) + with open(file) as f: + content = f.read() + repo.create_file(str(file_path), f"committing {file.name}", content) + except GithubException as e: + raise GithubException(f"Error uploading files to github: {e}") + + +def clone_github_repo(repo_name: str, destination_folder: Path): + repo_path = destination_folder / repo_name + if repo_path.exists(): + rmtree(repo_path) + else: + try: + repo_url = 
f"https://github.com/{ORG_NAME}/{repo_name}.git" + env = {"GIT_ASKPASS": "echo", "GIT_PASSWORD": GITHUB_TOKEN} + subprocess.run( + ["git", "clone", repo_url, str(repo_path)], + check=True, + capture_output=True, + env={k: str(v) for k, v in env.items()}, + ) + return repo_path + except subprocess.CalledProcessError as e: + print(f"Error cloning {repo_name} repository: {e}") + return None diff --git a/src/openpecha/ids.py b/src/openpecha/ids.py index b27d246..9560ae5 100644 --- a/src/openpecha/ids.py +++ b/src/openpecha/ids.py @@ -6,6 +6,10 @@ def get_uuid(): return uuid4().hex +def get_fourchar_uuid(): + return get_uuid()[:4] + + def get_id(prefix, length): return prefix + "".join(random.choices(uuid4().hex, k=length)).upper() diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 781d99e..7bfff2c 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -1,64 +1,136 @@ import json +from collections import defaultdict from pathlib import Path -from shutil import rmtree -from typing import Dict +from typing import Dict, Optional, Tuple -from stam import AnnotationStore, Offset, Selector - -from openpecha.config import ( - PECHA_ANNOTATION_STORE_ID, - PECHA_DATASET_ID, - PECHAS_PATH, - _mkdir, -) +from openpecha.config import PECHAS_PATH, _mkdir +from openpecha.github_utils import clone_github_repo from openpecha.ids import get_uuid -from openpecha.pecha.annotation import Annotation from openpecha.pecha.layer import Layer, LayerEnum +from openpecha.pecha.metadata import ( + InitialCreationType, + PechaMetadata, + to_json_serializable, +) class Pecha: def __init__( self, - pecha_id: str, - bases: Dict[str, str], - layers: Dict[str, Dict[LayerEnum, Layer]], - metadata: Dict[str, str], + pecha_id: str = None, + bases: Dict[str, str] = defaultdict(str), + layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict( + lambda: defaultdict() + ), + metadata: PechaMetadata = None, ) -> None: - self.pecha_id = 
pecha_id + self.pecha_id = metadata.id_ if metadata else pecha_id self.bases = bases self.layers = layers self.metadata = metadata @classmethod - def from_path(cls, path: str): - pass + def from_path(cls, pecha_path: Path): + pecha_id = pecha_path.stem + base_path = pecha_path / f"{pecha_id}.opf" + with open(base_path / "metadata.json", encoding="utf-8") as f: + metadata = json.load(f) + metadata = json.loads(metadata) + + preprocessed_meta = preprocess_metadata(metadata) + pecha_metadata = PechaMetadata(**preprocessed_meta) + pecha = Pecha(metadata=pecha_metadata) + pecha.pecha_path = pecha_path + + for base_file in (base_path / "base").rglob("*"): + base_text = base_file.read_text(encoding="utf-8") + pecha.set_base_file(base_text, base_file.stem) + + for layer_dir in (base_path / "layers").iterdir(): + for layer_file in layer_dir.glob("*.json"): + layer = Layer.from_path(layer_file) + pecha.set_layer(layer_dir.stem, layer.annotation_type, layer, layer.id_) + + return pecha @classmethod def from_id(cls, pecha_id: str): - pass + repo_path = clone_github_repo(pecha_id, PECHAS_PATH) + return cls.from_path(repo_path) - def write(self, export_path: Path = PECHAS_PATH): + def set_base_file(self, base_text: str, base_file_name: str = None) -> str: + base_file_name = base_file_name if base_file_name else get_uuid()[:4] + self.bases[base_file_name] = base_text + return base_file_name - pecha_dir = _mkdir(export_path / self.pecha_id) - self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf") + def set_layer( + self, + base_name: str, + annotation_type: LayerEnum, + layer: Layer, + layer_subtype_id: str = None, + ) -> str: + + """layer key is a tuple of layer label and layer id""" + """ A particular volume can have multiple layers with same label but different id""" + layer_subtype_id = get_uuid()[:4] if not layer_subtype_id else layer_subtype_id + self.layers[base_name][(annotation_type, layer_subtype_id)] = layer + return layer_subtype_id + + def write(self, 
output_path: Path = PECHAS_PATH): + if not self.pecha_id: + raise ValueError("pecha_id must be set before writing.") + + self.pecha_path = _mkdir(output_path / self.pecha_id) + + self.base_path = _mkdir(self.pecha_path / f"{self.pecha_id}.opf") """ write metadata """ self.metadata_fn = self.base_path / "metadata.json" self.metadata_fn.write_text( - json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8" + json.dumps( + to_json_serializable(self.metadata), indent=4, ensure_ascii=False + ), + encoding="utf-8", ) """ write base file""" - base_dir = _mkdir(self.base_path / "base") - for base_fname, base_text in self.bases.items(): - base_fn = base_dir / f"{base_fname}.txt" - base_fn.write_text(base_text, encoding="utf-8") - - layer_dir = _mkdir(self.base_path / "layers") - """ write annotation layers""" - for layer_fname, layer_data in self.layers.items(): - for _, layer in layer_data.items(): - _mkdir(layer_dir / layer_fname) - layer.write( - base_file_path=base_dir / layer_fname, - export_path=layer_dir / layer_fname, - ) + if self.bases: + base_dir = _mkdir(self.base_path / "base") + for base_name, base_text in self.bases.items(): + base_fn = base_dir / f"{base_name}.txt" + base_fn.write_text(base_text, encoding="utf-8") + if self.layers: + layer_dir = _mkdir(self.base_path / "layers") + """ write annotation layers""" + for layer_name, layer_data in self.layers.items(): + for _, layer in layer_data.items(): + _mkdir(layer_dir / layer_name) + layer.write( + base_file_path=base_dir / f"{layer_name}.txt", + output_path=output_path, + ) + + +def preprocess_metadata(metadata: Dict) -> Dict: + # Replace null values with default values + processed_metadata = { + "id_": metadata.get("id_", ""), + "title": metadata.get("title", []) if metadata.get("title") is not None else [], + "author": metadata.get("author", []) + if metadata.get("author") is not None + else [], + "source": metadata.get("source", "") + if metadata.get("source") is not None + else "", + 
"language": metadata.get("language", "") + if metadata.get("language") is not None + else "", + "initial_creation_type": InitialCreationType(metadata["initial_creation_type"]) + if "initial_creation_type" in metadata + else None, + "created_at": metadata.get("created_at"), + "source_metadata": metadata.get("source_metadata", {}) + if metadata.get("source_metadata") is not None + else {}, + } + return processed_metadata diff --git a/src/openpecha/pecha/annotation.py b/src/openpecha/pecha/annotation.py index c7f37c8..577cddd 100644 --- a/src/openpecha/pecha/annotation.py +++ b/src/openpecha/pecha/annotation.py @@ -1,8 +1,10 @@ from pydantic import BaseModel, Field, ValidationInfo, field_validator +from openpecha.ids import get_uuid + class Annotation(BaseModel): - segment: str + id_: str = Field(default_factory=get_uuid) start: int = Field(ge=0) end: int = Field(ge=0) metadata: dict = Field(default_factory=dict) diff --git a/src/openpecha/pecha/layer.py b/src/openpecha/pecha/layer.py index 671896c..b1ed743 100644 --- a/src/openpecha/pecha/layer.py +++ b/src/openpecha/pecha/layer.py @@ -1,64 +1,155 @@ import json +from collections import defaultdict from enum import Enum from pathlib import Path -from typing import Dict +from typing import Dict, Optional, Tuple -from stam import AnnotationStore, Offset, Selector +from pydantic import BaseModel, ConfigDict, Field +from stam import Annotation as StamAnnotation +from stam import AnnotationDataSet, AnnotationStore, Offset, Selector from openpecha.config import PECHA_ANNOTATION_STORE_ID, PECHA_DATASET_ID -from openpecha.ids import get_uuid +from openpecha.ids import get_fourchar_uuid, get_uuid from openpecha.pecha.annotation import Annotation class LayerEnum(Enum): segment = "Segment" - commentaries = "Commentaries" + commentaries = "Comment" -def get_annotation_category(): - # TODO - # Return annotation category based on the annotation label - return "Structure Type" +class LayerGroupEnum(Enum): + structure_type = 
"Structure Type" -class Layer: - def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotation]): - self.annotation_label = annotation_label - self.annotations = annotations +def get_annotation_category(layer_type: LayerEnum) -> LayerGroupEnum: + """return the annotation category for the layer label""" + if layer_type == LayerEnum.segment: + return LayerGroupEnum.structure_type + return LayerGroupEnum.structure_type - def covert_to_relative_path(self, json_string: str, export_path: Path): - """convert the absolute path to relative path for base file path in json string""" - json_object = json.loads(json_string) - for resource in json_object["resources"]: - original_path = Path(resource["@include"]) - resource["@include"] = str(original_path.relative_to(export_path)) - return json_object - def write(self, base_file_path: Path, export_path: Path): +class Layer(BaseModel): + id_: str = Field(default_factory=get_fourchar_uuid) + annotation_type: LayerEnum + annotations: Dict[str, Annotation] = defaultdict() + + annotation_store: Optional[AnnotationStore] = None + dataset: Optional[AnnotationDataSet] = None + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @classmethod + def from_path(cls, layer_file_path: Path): + """get annotation label""" + annotation_label = LayerEnum(layer_file_path.stem.split("-")[0]) + layer_id = layer_file_path.stem.split("-")[1] + """ load annotations from json""" + with open(layer_file_path) as f: + json_data = json.load(f) + absolute_base_path = layer_file_path.parents[4] + json_data = convert_relative_to_absolute_path(json_data, absolute_base_path) + annotation_store = AnnotationStore(string=json.dumps(json_data)) + + layer_annotations: Dict[str, Annotation] = {} + for annotation in annotation_store.annotations(): + annotation_id, segment = annotation.id(), str(annotation) + start = annotation.offset().begin().value() + end = annotation.offset().end().value() + layer_annotations[annotation_id] = Annotation( 
+ id_=annotation_id, segment=segment, start=start, end=end + ) + + return Layer( + id_=layer_id, + annotation_type=annotation_label, + annotations=layer_annotations, + annotation_store=annotation_store, + ) + + def get_annotations(self): + if not self.annotation_store: + return None + for ann in self.annotation_store: + yield self.parse_annotation(ann) + + def get_annotation(self, ann_id: str): + if not self.annotation_store: + return None + ann = self.annotation_store.annotation(id=ann_id) + return self.parse_annotation(ann) + + def parse_annotation(self, ann: StamAnnotation): + ann_id = ann.id() + ann_segment = str(ann) + start = ann.offset().begin().value() + end = ann.offset().end().value() + + parsed_ann = {"id": ann_id, "segment": ann_segment, "start": start, "end": end} + + for ann_data in ann: + key, value = ann_data.key().id(), str(ann_data.value()) + if key in LayerGroupEnum._value2member_map_: + parsed_ann["annotation_category"] = key + parsed_ann["annotation_type"] = value + else: + parsed_ann["payloads"] = defaultdict(str) + parsed_ann["payloads"][key] = value + + return parsed_ann + + def set_annotation(self, annotation: Annotation): + self.annotations[annotation.id_] = annotation + + def write(self, base_file_path: Path, output_path: Path): + base_file_path = base_file_path """write annotations in stam data model""" self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID) - self.resource = self.annotation_store.add_resource( + resource = self.annotation_store.add_resource( id=base_file_path.name, filename=base_file_path.as_posix() ) self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID) - - annotation_category = get_annotation_category() + annotation_category = get_annotation_category(self.annotation_type).value self.dataset.add_key(annotation_category) - unique_annotation_data_id = get_uuid() + unique_ann_data_id = get_uuid() + ann_data_ids: Dict[Tuple[str, str], str] = {} + for annotation_id, annotation in 
self.annotations.items(): target = Selector.textselector( - self.resource, + resource, Offset.simple(annotation.start, annotation.end), ) + data = [ { - "id": unique_annotation_data_id, + "id": unique_ann_data_id, "key": annotation_category, - "value": self.annotation_label.value, + "value": self.annotation_type.value, "set": self.dataset.id(), } ] + """ + add metadata to the annotation if exists + if the metadata is already added, get the id from the dictionary, + else create a new id and add to the dictionary + """ + if annotation.metadata: + for key, value in annotation.metadata.items(): + if (key, value) in ann_data_ids: + ann_data_id = ann_data_ids[(key, value)] + else: + ann_data_id = get_uuid() + ann_data_ids[(key, value)] = ann_data_id + data.append( + { + "id": ann_data_id, + "key": key, + "value": value, + "set": self.dataset.id(), + } + ) + self.annotation_store.annotate( id=annotation_id, target=target, @@ -66,11 +157,29 @@ def write(self, base_file_path: Path, export_path: Path): ) """ save annotations in json""" json_string = self.annotation_store.to_json_string() - json_object = self.covert_to_relative_path(json_string, export_path) + json_object = convert_to_relative_path(json_string, output_path) """ add four uuid digits to the layer file name for uniqueness""" - layer_fname = f"{self.annotation_label.value}-{get_uuid()[:4]}.json" + layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem + layer_file_path = layer_dir / f"{self.annotation_type.value}-{self.id_}.json" with open( - export_path / layer_fname, + layer_file_path, "w", ) as f: f.write(json.dumps(json_object, indent=4, ensure_ascii=False)) + + +def convert_relative_to_absolute_path(json_data, absolute_base_path: Path): + """call after loading the stam from json""" + for resource in json_data["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(absolute_base_path / original_path) + return json_data + + +def 
convert_to_relative_path(json_string: str, output_path: Path): + """convert the absolute path to relative path for base file path in json string""" + json_object = json.loads(json_string) + for resource in json_object["resources"]: + original_path = Path(resource["@include"]) + resource["@include"] = str(original_path.relative_to(output_path)) + return json_object diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py new file mode 100644 index 0000000..8220539 --- /dev/null +++ b/src/openpecha/pecha/metadata.py @@ -0,0 +1,84 @@ +import json +from collections import defaultdict +from datetime import datetime +from enum import Enum +from typing import Dict, List, Optional + +from pydantic import BaseModel, Field, field_validator, model_validator + +from openpecha.ids import get_initial_pecha_id + + +class InitialCreationType(Enum): + ocr = "ocr" + ebook = "ebook" + input = "input" + tmx = "tmx" + + +class PechaMetadata(BaseModel): + id_: str = Field(default=None, alias="id_") + title: List[str] = Field(default=list, alias="title") + author: List[str] = Field(default=list, alias="author") + source: str = Field(default=None, alias="source") + language: str = Field(default=None, alias="language") + initial_creation_type: InitialCreationType = Field( + None, alias="initial_creation_type" + ) + created_at: Optional[datetime] = Field(default=None, alias="created_at") + source_metadata: Optional[Dict] = Field( + default=dict + ) # place to dump any metadata from the source + + @field_validator("created_at", mode="before") + def set_imported_date(cls, v): + return v or datetime.now() + + class Config: + json_encoders = { + InitialCreationType: lambda v: v.value, + defaultdict: lambda d: dict(d), + } + + +def to_json_serializable(pecha_metadata: Optional[PechaMetadata]) -> str: + if pecha_metadata is None: + return json.dumps({}, indent=4, ensure_ascii=False) + + dict_data = pecha_metadata.model_dump() + if dict_data["initial_creation_type"] is not 
None: + dict_data["initial_creation_type"] = dict_data["initial_creation_type"].value + for k, v in dict_data.items(): + if v is list: + dict_data[k] = [] + continue + if v is dict: + dict_data[k] = {} + return json.dumps(dict_data, indent=4, ensure_ascii=False) + + +class InitialPechaMetadata(PechaMetadata): + @model_validator(mode="before") + @classmethod + def set_id(cls, values): + if "id_" not in values or values["id_"] is None: + values["id_"] = get_initial_pecha_id() + return values + + +class OpenPechaMetadata(PechaMetadata): + @model_validator(mode="before") + @classmethod + def set_id(cls, values): + if "id_" not in values or values["id_"] is None: + values["id_"] = get_initial_pecha_id() + return values + + +class DiplomaticPechaMetadata(PechaMetadata): + @model_validator(mode="before") + @classmethod + def set_id(cls, values): + if "id_" not in values or values["id_"] is None: + values["id_"] = get_initial_pecha_id() + return values diff --git a/tests/alignment/parsers/plaintext/test_plaintext.py b/tests/alignment/parsers/plaintext/test_plaintext.py index 60c1aea..91e6a09 100644 --- a/tests/alignment/parsers/plaintext/test_plaintext.py +++ b/tests/alignment/parsers/plaintext/test_plaintext.py @@ -11,17 +11,15 @@ def get_data_dir(): def get_metadata(): return { "source": { - "annotation_category": "Structure Type", - "annotation_label": "Segment", + "annotation_type": "Segment", }, "target": { - "annotation_category": "Structure Type", - "annotation_label": "Comment", + "annotation_type": "Comment", }, } -def test_plaintext_parse(): +def test_PlainTextLineAlignedParser_parse(): DATA_DIR = get_data_dir() source_path = DATA_DIR / "segments.txt" target_path = DATA_DIR / "comments.txt" @@ -30,26 +28,7 @@ def test_plaintext_parse(): plaintext = PlainTextLineAlignedParser.from_files( source_path, target_path, metadata ) - plaintext.parse() - - assert ( - len(plaintext.source_segments) == 5 - ), "plaintext parser is not parsing source_segments correctly" - 
assert ( - len(plaintext.target_segments) == 5 - ), "plaintext parser is not parsing target_segments correctly" - - -def test_plaintext_save(): - DATA_DIR = get_data_dir() - source_path = DATA_DIR / "segments.txt" - target_path = DATA_DIR / "comments.txt" - - metadata = get_metadata() - plaintext = PlainTextLineAlignedParser.from_files( - source_path, target_path, metadata - ) - source_pecha, target_pecha = plaintext.save() + source_pecha, target_pecha = plaintext.parse() assert isinstance( source_pecha, Pecha diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt new file mode 100644 index 0000000..0b166fc --- /dev/null +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt @@ -0,0 +1 @@ +རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ། \ No newline at end of file diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json new file mode 100644 index 0000000..92bcaec --- /dev/null +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json @@ -0,0 +1,88 @@ +{ + "@type": "AnnotationStore", + "@id": "PechaAnnotationStore", + "resources": [ + { + "@type": "TextResource", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "@include": "IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt" + } + ], + "annotationsets": [ + { + "@type": "AnnotationDataSet", + "@id": "PechaDataSet", + "keys": [ + { + "@type": "DataKey", + "@id": "Structure Type" + } + ], + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "key": "Structure Type", + "value": { + "@type": "String", + "value": "Segment" + } 
+ } + ] + } + ], + "annotations": [ + { + "@type": "Annotation", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 0 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 39 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + }, + { + "@type": "Annotation", + "@id": "b696df2dbe314e8a87881a2bc391d0d5", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 39 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 103 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json new file mode 100644 index 0000000..38be7bc --- /dev/null +++ b/tests/pecha/read/data/IE7D6875F/IE7D6875F.opf/metadata.json @@ -0,0 +1 @@ +"{\n \"id_\": \"IE7D6875F\",\n \"title\": null,\n \"author\": null,\n \"source\": null,\n \"language\": null,\n \"initial_creation_type\": \"input\",\n \"created_at\": null,\n \"source_metadata\": {}\n}" \ No newline at end of file diff --git a/tests/pecha/read/test_pecha_read.py b/tests/pecha/read/test_pecha_read.py new file mode 100644 index 0000000..fbd4f9a --- /dev/null +++ b/tests/pecha/read/test_pecha_read.py @@ -0,0 +1,48 @@ +from pathlib import Path + +from openpecha.pecha import Pecha +from openpecha.pecha.layer import Layer, LayerEnum + + +def test_pecha_read(): + DATA = Path(__file__).parent / "data" + pecha = Pecha.from_path(DATA / "IE7D6875F") + assert pecha.pecha_id == "IE7D6875F" + assert 
"f2b056668a0c4ad3a085bdcd8e2d7adb" in pecha.bases + assert ( + pecha.bases["f2b056668a0c4ad3a085bdcd8e2d7adb"] + == "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།" + ) + + for layer_key, layer in pecha.layers["f2b056668a0c4ad3a085bdcd8e2d7adb"].items(): + annotation_type, layer_id = layer_key + assert annotation_type == LayerEnum.segment + assert isinstance(layer_id, str) + assert isinstance(layer, Layer) + + first_layer = pecha.layers["f2b056668a0c4ad3a085bdcd8e2d7adb"][ + (LayerEnum.segment, "bf13") + ] + + annotations = list(first_layer.get_annotations()) + assert annotations == [ + { + "id": "f2b056668a0c4ad3a085bdcd8e2d7adb", + "segment": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", + "start": 0, + "end": 39, + "annotation_category": "Structure Type", + "annotation_type": "Segment", + }, + { + "id": "b696df2dbe314e8a87881a2bc391d0d5", + "segment": "བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", + "start": 39, + "end": 103, + "annotation_category": "Structure Type", + "annotation_type": "Segment", + }, + ] + + +test_pecha_read() diff --git a/tests/pecha/test_pecha.py b/tests/pecha/test_pecha.py deleted file mode 100644 index 47f8e18..0000000 --- a/tests/pecha/test_pecha.py +++ /dev/null @@ -1,77 +0,0 @@ -from pathlib import Path -from shutil import rmtree - -from openpecha.pecha import Pecha -from openpecha.pecha.annotation import Annotation - - -def get_data_dir(): - export_path = Path(__file__).parent / "data" - export_path.mkdir(parents=True, exist_ok=True) - return export_path - - -def get_segments(): - return { - "f2b056668a0c4ad3a085bdcd8e2d7adb": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", - "b696df2dbe314e8a87881a2bc391d0d5": "བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", - } - - -def get_metadata(): - return { - "annotation_category": "Structure Type", - "annotation_label": "Segment", - } - - -def get_expected_annotations(): - expected_annotations = [ - 
Annotation( - id_="f2b056668a0c4ad3a085bdcd8e2d7adb", - segment="རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།", - start=0, - end=39, - metadata={}, - ), - Annotation( - id_="b696df2dbe314e8a87881a2bc391d0d5", - segment="བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།", - start=39, - end=103, - metadata={}, - ), - ] - return expected_annotations - - -def test_pecha_set_annotations(): - pecha_id = "IE7D6875F" - segments = get_segments() - metadata = get_metadata() - pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) - assert isinstance( - pecha, Pecha - ), "Not able to create Pecha object with id, segments and metadata" - - annotations = list(pecha.set_annotations()) - assert ( - annotations == get_expected_annotations() - ), "Pecha not able to set annotations for the segments" - - -def test_pecha_write_annotations(): - pecha_id = "IE7D6875F" - segments = get_segments() - metadata = get_metadata() - pecha = Pecha(pecha_id=pecha_id, segments=segments, metadata=metadata) - export_path = get_data_dir() - pecha.write_annotations(export_path=export_path) - assert pecha.base_fn.exists(), "Pecha not able to write base file" - assert pecha.metadata_fn.exists(), "Pecha not able to write metadata file" - assert pecha.annotation_fn.rglob( - "*.json" - ), "Pecha not able to write annotation file" - - """ clean up """ - rmtree(Path(export_path / pecha_id)) diff --git a/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt new file mode 100644 index 0000000..0b166fc --- /dev/null +++ b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt @@ -0,0 +1 @@ +རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ། \ No newline at end of file diff --git 
a/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json new file mode 100644 index 0000000..92bcaec --- /dev/null +++ b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/layers/f2b056668a0c4ad3a085bdcd8e2d7adb/Segment-bf13.json @@ -0,0 +1,88 @@ +{ + "@type": "AnnotationStore", + "@id": "PechaAnnotationStore", + "resources": [ + { + "@type": "TextResource", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "@include": "IE7D6875F/IE7D6875F.opf/base/f2b056668a0c4ad3a085bdcd8e2d7adb.txt" + } + ], + "annotationsets": [ + { + "@type": "AnnotationDataSet", + "@id": "PechaDataSet", + "keys": [ + { + "@type": "DataKey", + "@id": "Structure Type" + } + ], + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "key": "Structure Type", + "value": { + "@type": "String", + "value": "Segment" + } + } + ] + } + ], + "annotations": [ + { + "@type": "Annotation", + "@id": "f2b056668a0c4ad3a085bdcd8e2d7adb", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 0 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 39 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + }, + { + "@type": "Annotation", + "@id": "b696df2dbe314e8a87881a2bc391d0d5", + "target": { + "@type": "TextSelector", + "resource": "f2b056668a0c4ad3a085bdcd8e2d7adb.txt", + "offset": { + "@type": "Offset", + "begin": { + "@type": "BeginAlignedCursor", + "value": 39 + }, + "end": { + "@type": "BeginAlignedCursor", + "value": 103 + } + } + }, + "data": [ + { + "@type": "AnnotationData", + "@id": "0c2c4165fb58464eabf9db0d6a3a1080", + "set": "PechaDataSet" + } + ] + } + ] +} \ No 
newline at end of file
diff --git a/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json
new file mode 100644
index 0000000..cb740ab
--- /dev/null
+++ b/tests/pecha/write/expected_output/IE7D6875F/IE7D6875F.opf/metadata.json
@@ -0,0 +1,3 @@
+{
+  "annotation_label": "Segment"
+}
\ No newline at end of file
diff --git a/tests/pecha/write/test_pecha.py b/tests/pecha/write/test_pecha.py
new file mode 100644
index 0000000..c905b80
--- /dev/null
+++ b/tests/pecha/write/test_pecha.py
@@ -0,0 +1,96 @@
+from pathlib import Path
+from shutil import rmtree
+from unittest import mock
+
+from openpecha.pecha import Pecha
+from openpecha.pecha.annotation import Annotation
+from openpecha.pecha.layer import Layer, LayerEnum
+from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata
+
+
+def get_data_dir():
+    """Create the scratch ``output`` directory beside this file and return it."""
+    output_path = Path(__file__).parent / "output"
+    output_path.mkdir(parents=True, exist_ok=True)
+    return output_path
+
+
+def get_metadata():
+    """Return sample metadata; not referenced elsewhere in this module."""
+    return {
+        "annotation_type": "Segment",
+    }
+
+
+def get_base():
+    """Return the base texts keyed by base name."""
+    return {
+        "f2b056668a0c4ad3a085bdcd8e2d7adb": "རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྱ་ཨ་བ་ཏཱ་ར།བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པའི་ལེགས་པར་སྦྱར་བ།"  # noqa
+    }
+
+
+def get_layer():
+    """Return layers keyed by base name, then by ``(layer type, layer id)``."""
+    return {
+        "f2b056668a0c4ad3a085bdcd8e2d7adb": {
+            (LayerEnum.segment, "bf13"): Layer(
+                id_="bf13",
+                annotation_type=LayerEnum("Segment"),
+                annotations=get_annotations(),
+            )
+        }
+    }
+
+
+def get_annotations():
+    """Return two adjacent segment annotations (0-39, 39-103) keyed by id."""
+    return {
+        "f2b056668a0c4ad3a085bdcd8e2d7adb": Annotation(
+            start=0,
+            end=39,
+            metadata={},
+        ),
+        "b696df2dbe314e8a87881a2bc391d0d5": Annotation(
+            start=39,
+            end=103,
+            metadata={},
+        ),
+    }
+
+
+def test_pecha_write():
+    """Write a pecha and compare its file names with the expected fixture tree.
+
+    Only file and directory names are compared, not file contents.
+    """
+    output_path = get_data_dir()
+    expected_output_path = Path(__file__).parent / "expected_output"
+    try:
+        with mock.patch(
+            "openpecha.pecha.metadata.get_initial_pecha_id"
+        ) as mock_get_initial_pecha_id:
+            # Pin the pecha id so output names match the expected_output fixture.
+            mock_get_initial_pecha_id.return_value = "IE7D6875F"
+
+            metadata = InitialPechaMetadata(
+                initial_creation_type=InitialCreationType.input
+            )
+            pecha = Pecha(metadata=metadata)
+            pecha.bases = get_base()
+            pecha.layers = get_layer()
+
+            pecha.write(output_path=output_path)
+
+        output_file_names = sorted(file.name for file in output_path.rglob("*"))
+        expected_file_names = sorted(
+            file.name for file in expected_output_path.rglob("*")
+        )
+
+        assert output_file_names == expected_file_names
+    finally:
+        # Remove the scratch directory even when the assertion fails.
+        rmtree(output_path)
+
+
+if __name__ == "__main__":
+    test_pecha_write()