Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/write pecha annotation #3

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
12878b8
fix/layer name in Layer write function
tenzin3 Jul 4, 2024
2dcf475
moodify LayerEnum value
tenzin3 Jul 4, 2024
2558f19
modify/meta data defination in test_plaintext
tenzin3 Jul 4, 2024
d210716
test for pecha write function
tenzin3 Jul 4, 2024
7437fb0
raise Value Error if annotation segment doesnt match the base text
tenzin3 Jul 4, 2024
57991fe
Layer function set annotation
tenzin3 Jul 4, 2024
6879c9b
Pecha set base file, layer and metadata file
tenzin3 Jul 4, 2024
1eb9be4
refactor
tenzin3 Jul 4, 2024
4729094
refactor code
tenzin3 Jul 4, 2024
40b9781
modify/set Tuple of LayerEnum and str as Layer key
tenzin3 Jul 4, 2024
ba8c3c9
get annotation category
tenzin3 Jul 4, 2024
f22e47e
refactor test folder structure
tenzin3 Jul 4, 2024
ee9cc07
modify/annotation_label -> annotation_type
tenzin3 Jul 5, 2024
3a155af
fix/passing empty dict
tenzin3 Jul 5, 2024
1b07070
delete segment attribute from Annotation class
tenzin3 Jul 5, 2024
8f525ce
modify/set basefile name in set_base_file function
tenzin3 Jul 5, 2024
3218572
modify layer_label -> layer_type
tenzin3 Jul 5, 2024
4ca26d0
modify/export_path -> output_path
tenzin3 Jul 5, 2024
5ca3fcb
modify/base_fname -> base_name
tenzin3 Jul 5, 2024
7c9663f
add id_ attribute to Annotation class
tenzin3 Jul 5, 2024
a606ea0
add id_ attribute to Layer class
tenzin3 Jul 5, 2024
869d207
modify/ layer_dir -> base_name
tenzin3 Jul 5, 2024
3b6f1b8
modify layer_id -> layer_subtype_id
tenzin3 Jul 5, 2024
0f034d9
create PechaMetadata
tenzin3 Jul 8, 2024
3d188ec
instate Pecha with metadata
tenzin3 Jul 8, 2024
15fee96
fix field_validator attribute
tenzin3 Jul 8, 2024
753c721
fix pydantic validator
tenzin3 Jul 8, 2024
20cfefb
pass metadata to Pechadata
tenzin3 Jul 8, 2024
d7bdb30
set pecha_id if not in metadata
tenzin3 Jul 8, 2024
2d14de7
make PechaMetadata json serializable
tenzin3 Jul 8, 2024
9f7a01a
modify test_pecha instantiate pecha with metadata
tenzin3 Jul 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ classifiers = [
dependencies = [
"pydantic >= 2.7.4",
"stam == 0.8.2",
"collection >= 0.1.6",

]

Expand Down
67 changes: 30 additions & 37 deletions src/openpecha/alignment/parsers/plaintext.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from pathlib import Path
from typing import Dict
from typing import List

from openpecha.ids import get_initial_pecha_id, get_uuid
from openpecha.pecha import Pecha
from openpecha.pecha.annotation import Annotation
from openpecha.pecha.layer import Layer, LayerEnum
from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata


class PlainTextLineAlignedParser:
Expand All @@ -19,55 +19,48 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict):
target_text = target_path.read_text(encoding="utf-8")
return cls(source_text, target_text, metadata)

def create_pecha_layer(
    self,
    segments: List[str],
    annotation_type: LayerEnum,
    separator: str = "\n",
):
    """Build a layer with one span annotation per segment.

    Args:
        segments: Pieces of the base text in reading order, as produced by
            splitting the base text on ``separator``.
        annotation_type: Layer type stored on the returned layer.
        separator: String that was removed between consecutive segments
            when the base text was split. Its length is skipped between
            spans so offsets keep pointing at the right characters.

    Returns:
        A ``Layer`` whose annotations carry ``start``/``end`` character
        offsets into the unsplit base text.
    """
    layer = Layer(annotation_type=annotation_type)
    gap = len(separator)
    char_count = 0
    for segment in segments:
        segment_end = char_count + len(segment)
        layer.set_annotation(Annotation(start=char_count, end=segment_end))
        # The separator is still present in the base text even though the
        # split dropped it, so the next span starts after it.
        char_count = segment_end + gap

    return layer

def parse(self):
    """Build the source and target pechas from the two line-aligned texts.

    Each pecha receives its text as a single base file and one
    ``LayerEnum.segment`` layer with one annotation per line of that text.

    Returns:
        Tuple ``(source_pecha, target_pecha)``.
    """
    # Imported locally so this method does not depend on the module's
    # import block.
    from collections import defaultdict

    source_pecha_metadata, target_pecha_metadata = (
        InitialPechaMetadata(
            initial_creation_type=InitialCreationType.input,
            source_metadata=self.metadata["source"],
        ),
        InitialPechaMetadata(
            initial_creation_type=InitialCreationType.input,
            source_metadata=self.metadata["target"],
        ),
    )
    # Give every pecha its own containers: relying on the constructor's
    # default arguments would make both pechas write into the same dicts.
    source_pecha = Pecha(
        bases={}, layers=defaultdict(dict), metadata=source_pecha_metadata
    )
    target_pecha = Pecha(
        bases={}, layers=defaultdict(dict), metadata=target_pecha_metadata
    )

    source_base_name = source_pecha.set_base_file(self.source_text)
    target_base_name = target_pecha.set_base_file(self.target_text)

    source_pecha.set_layer(
        source_base_name,
        LayerEnum.segment,
        self.create_pecha_layer(self.source_text.split("\n"), LayerEnum.segment),
    )
    target_pecha.set_layer(
        target_base_name,
        LayerEnum.segment,
        self.create_pecha_layer(self.target_text.split("\n"), LayerEnum.segment),
    )

    return source_pecha, target_pecha

# TODO:
Expand Down
4 changes: 4 additions & 0 deletions src/openpecha/ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ def get_uuid():
return uuid4().hex


def get_fourchar_uuid():
    """Return the first four characters of a fresh ``get_uuid()`` value."""
    full_id = get_uuid()
    return full_id[:4]


def get_id(prefix, length):
    """Return ``prefix`` followed by ``length`` random upper-case hex characters.

    The characters are drawn with replacement from the hex digits of a
    freshly generated ``uuid4``. The prefix is returned unchanged; only the
    random part is upper-cased.
    """
    alphabet = uuid4().hex
    picked = random.choices(alphabet, k=length)
    suffix = "".join(picked).upper()
    return prefix + suffix

Expand Down
82 changes: 49 additions & 33 deletions src/openpecha/pecha/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,25 @@
import json
from collections import defaultdict
from pathlib import Path
from shutil import rmtree
from typing import Dict
from typing import Dict, Optional, Tuple

from stam import AnnotationStore, Offset, Selector

from openpecha.config import (
PECHA_ANNOTATION_STORE_ID,
PECHA_DATASET_ID,
PECHAS_PATH,
_mkdir,
)
from openpecha.config import PECHAS_PATH, _mkdir
from openpecha.ids import get_uuid
from openpecha.pecha.annotation import Annotation
from openpecha.pecha.layer import Layer, LayerEnum
from openpecha.pecha.metadata import PechaMetadata, to_json_serializable


class Pecha:
def __init__(
self,
pecha_id: str,
bases: Dict[str, str],
layers: Dict[str, Dict[LayerEnum, Layer]],
metadata: Dict[str, str],
pecha_id: str = None,
bases: Dict[str, str] = defaultdict(),
layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict(
lambda: defaultdict()
),
metadata: PechaMetadata = None,
) -> None:
self.pecha_id = pecha_id
self.pecha_id = metadata.id_ if metadata else pecha_id
self.bases = bases
self.layers = layers
self.metadata = metadata
Expand All @@ -37,28 +32,49 @@ def from_path(cls, path: str):
def from_id(cls, pecha_id: str):
pass

def write(self, export_path: Path = PECHAS_PATH):
def set_base_file(self, base_text: str) -> str:
    """Store ``base_text`` under a newly generated base name.

    Returns:
        The generated base name, i.e. the key under which the text was
        added to ``self.bases``.
    """
    new_base_name = get_uuid()
    self.bases[new_base_name] = base_text
    return new_base_name

def set_layer(
    self, base_name: str, annotation_type: LayerEnum, layer: Layer
) -> str:
    """Attach ``layer`` to the base ``base_name`` and return its sub-id.

    The layer key is the tuple ``(annotation_type, layer_subtype_id)``, so
    one base can hold several layers of the same type under different ids.
    """
    layer_subtype_id = get_uuid()[:4]
    layer_key = (annotation_type, layer_subtype_id)
    layers_of_base = self.layers[base_name]
    layers_of_base[layer_key] = layer
    return layer_subtype_id

pecha_dir = _mkdir(export_path / self.pecha_id)
def write(self, output_path: Path = PECHAS_PATH):
    """Write metadata, base files and annotation layers below ``output_path``.

    Files are placed under ``<output_path>/<pecha_id>/<pecha_id>.opf/``:
    ``metadata.json``, ``base/<base_name>.txt`` for every base, and one
    ``layers/<base_name>/`` directory per base that has layers (each layer
    writes its own file through ``Layer.write``).

    Raises:
        ValueError: If ``pecha_id`` is not set.
    """
    if not self.pecha_id:
        raise ValueError("pecha_id must be set before writing.")

    pecha_dir = _mkdir(output_path / self.pecha_id)
    self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf")

    # Write metadata.
    self.metadata_fn = self.base_path / "metadata.json"
    self.metadata_fn.write_text(
        json.dumps(
            to_json_serializable(self.metadata), indent=4, ensure_ascii=False
        ),
        encoding="utf-8",
    )

    # Resolve the base directory up front: the layer loop below needs it
    # even when there are no base files to write.
    base_dir = self.base_path / "base"

    # Write base files.
    if self.bases:
        _mkdir(base_dir)
        for base_name, base_text in self.bases.items():
            base_fn = base_dir / f"{base_name}.txt"
            base_fn.write_text(base_text, encoding="utf-8")

    # Write annotation layers.
    if self.layers:
        layer_dir = _mkdir(self.base_path / "layers")
        for layer_name, layer_data in self.layers.items():
            if not layer_data:
                # No layer for this base, so no directory is created.
                continue
            _mkdir(layer_dir / layer_name)
            for layer in layer_data.values():
                layer.write(
                    base_file_path=base_dir / f"{layer_name}.txt",
                    output_path=output_path,
                )
4 changes: 3 additions & 1 deletion src/openpecha/pecha/annotation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from pydantic import BaseModel, Field, ValidationInfo, field_validator

from openpecha.ids import get_uuid


class Annotation(BaseModel):
segment: str
id_: str = Field(default_factory=get_uuid)
start: int = Field(ge=0)
end: int = Field(ge=0)
metadata: dict = Field(default_factory=dict)
Expand Down
63 changes: 39 additions & 24 deletions src/openpecha/pecha/layer.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,75 @@
import json
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import Dict
from typing import Dict, Optional

from stam import AnnotationStore, Offset, Selector
from pydantic import BaseModel, ConfigDict, Field
from stam import AnnotationDataSet, AnnotationStore, Offset, Selector

from openpecha.config import PECHA_ANNOTATION_STORE_ID, PECHA_DATASET_ID
from openpecha.ids import get_uuid
from openpecha.ids import get_fourchar_uuid, get_uuid
from openpecha.pecha.annotation import Annotation


class LayerEnum(Enum):
    """Annotation layer types.

    Each member name must be defined exactly once: ``Enum`` refuses to
    build a class that reuses a member name.
    """

    segment = "Segment"
    commentaries = "Comment"


def get_annotation_category():
# TODO
# Return annotation category based on the annotation label
return "Structure Type"
class LayerGroupEnum(Enum):
    """Groups (categories) that annotation layer types are sorted into."""

    structure_type = "Structure Type"


class Layer:
def __init__(self, annotation_label: LayerEnum, annotations: Dict[str, Annotation]):
self.annotation_label = annotation_label
self.annotations = annotations
def get_annotation_category(layer_type: LayerEnum) -> LayerGroupEnum:
    """Return the layer group that ``layer_type`` belongs to.

    Every layer type, ``LayerEnum.segment`` included, currently maps to
    ``LayerGroupEnum.structure_type``.
    """
    category = LayerGroupEnum.structure_type
    return category

def covert_to_relative_path(self, json_string: str, export_path: Path):

class Layer(BaseModel):
id_: str = Field(default_factory=get_fourchar_uuid)
annotation_type: LayerEnum
annotations: Dict[str, Annotation] = defaultdict()

annotation_store: Optional[AnnotationStore] = None
dataset: Optional[AnnotationDataSet] = None

model_config = ConfigDict(arbitrary_types_allowed=True)

def set_annotation(self, annotation: Annotation):
    """Store ``annotation`` in this layer, keyed by its ``id_``."""
    annotation_key = annotation.id_
    self.annotations[annotation_key] = annotation

def covert_to_relative_path(self, json_string: str, output_path: Path):
    """Rewrite the resource paths in a serialized store relative to ``output_path``.

    Args:
        json_string: JSON text containing a ``"resources"`` list whose
            entries each have an ``"@include"`` path.
        output_path: Directory the ``"@include"`` paths are made relative to.

    Returns:
        The parsed JSON object (not a string) with every ``"@include"``
        replaced by its path relative to ``output_path``.

    Raises:
        ValueError: If an ``"@include"`` path is not located under
            ``output_path`` (raised by ``Path.relative_to``).
    """
    json_object = json.loads(json_string)
    for resource in json_object["resources"]:
        original_path = Path(resource["@include"])
        resource["@include"] = str(original_path.relative_to(output_path))
    return json_object

def write(self, base_file_path: Path, export_path: Path):
def write(self, base_file_path: Path, output_path: Path):
base_file_path = base_file_path
"""write annotations in stam data model"""
self.annotation_store = AnnotationStore(id=PECHA_ANNOTATION_STORE_ID)
self.resource = self.annotation_store.add_resource(
resource = self.annotation_store.add_resource(
id=base_file_path.name, filename=base_file_path.as_posix()
)
self.dataset = self.annotation_store.add_dataset(id=PECHA_DATASET_ID)

annotation_category = get_annotation_category()
annotation_category = get_annotation_category(self.annotation_type).value
self.dataset.add_key(annotation_category)

unique_annotation_data_id = get_uuid()
for annotation_id, annotation in self.annotations.items():
target = Selector.textselector(
self.resource,
resource,
Offset.simple(annotation.start, annotation.end),
)
data = [
{
"id": unique_annotation_data_id,
"key": annotation_category,
"value": self.annotation_label.value,
"value": self.annotation_type.value,
"set": self.dataset.id(),
}
]
Expand All @@ -66,11 +80,12 @@ def write(self, base_file_path: Path, export_path: Path):
)
""" save annotations in json"""
json_string = self.annotation_store.to_json_string()
json_object = self.covert_to_relative_path(json_string, export_path)
json_object = self.covert_to_relative_path(json_string, output_path)
""" add four uuid digits to the layer file name for uniqueness"""
layer_fname = f"{self.annotation_label.value}-{get_uuid()[:4]}.json"
layer_dir = base_file_path.parent.parent / "layers" / base_file_path.stem
layer_file_path = layer_dir / f"{self.annotation_type.value}-{self.id_}.json"
with open(
export_path / layer_fname,
layer_file_path,
"w",
) as f:
f.write(json.dumps(json_object, indent=4, ensure_ascii=False))
Loading
Loading