Feat/read pecha annotation #5

Closed · wants to merge 50 commits

Commits (50)
12878b8
fix/layer name in Layer write function
tenzin3 Jul 4, 2024
2dcf475
modify LayerEnum value
tenzin3 Jul 4, 2024
2558f19
modify/metadata definition in test_plaintext
tenzin3 Jul 4, 2024
d210716
test for pecha write function
tenzin3 Jul 4, 2024
7437fb0
raise ValueError if annotation segment doesn't match the base text
tenzin3 Jul 4, 2024
57991fe
Layer function set annotation
tenzin3 Jul 4, 2024
6879c9b
Pecha set base file, layer and metadata file
tenzin3 Jul 4, 2024
1eb9be4
refactor
tenzin3 Jul 4, 2024
4729094
refactor code
tenzin3 Jul 4, 2024
40b9781
modify/set Tuple of LayerEnum and str as Layer key
tenzin3 Jul 4, 2024
ba8c3c9
get annotation category
tenzin3 Jul 4, 2024
f22e47e
refactor test folder structure
tenzin3 Jul 4, 2024
11f79bb
Layer class method from_path
tenzin3 Jul 4, 2024
fdb37ab
Pecha classmethod from_path
tenzin3 Jul 4, 2024
a28eb5c
fix/stam function to_json_string
tenzin3 Jul 5, 2024
ee9cc07
modify/annotation_label -> annotation_type
tenzin3 Jul 5, 2024
3a155af
fix/passing empty dict
tenzin3 Jul 5, 2024
1b07070
delete segment attribute from Annotation class
tenzin3 Jul 5, 2024
8f525ce
modify/set basefile name in set_base_file function
tenzin3 Jul 5, 2024
3218572
modify layer_label -> layer_type
tenzin3 Jul 5, 2024
4ca26d0
modify/export_path -> output_path
tenzin3 Jul 5, 2024
5ca3fcb
modify/base_fname -> base_name
tenzin3 Jul 5, 2024
7c9663f
add id_ attribute to Annotation class
tenzin3 Jul 5, 2024
a606ea0
add id_ attribute to Layer class
tenzin3 Jul 5, 2024
869d207
modify/ layer_dir -> base_name
tenzin3 Jul 5, 2024
3b6f1b8
modify layer_id -> layer_subtype_id
tenzin3 Jul 5, 2024
0f034d9
create PechaMetadata
tenzin3 Jul 8, 2024
3d188ec
instantiate Pecha with metadata
tenzin3 Jul 8, 2024
15fee96
fix field_validator attribute
tenzin3 Jul 8, 2024
753c721
fix pydantic validator
tenzin3 Jul 8, 2024
20cfefb
pass metadata to Pechadata
tenzin3 Jul 8, 2024
d7bdb30
set pecha_id if not in metadata
tenzin3 Jul 8, 2024
2d14de7
make PechaMetadata json serializable
tenzin3 Jul 8, 2024
9f7a01a
modify test_pecha instantiate pecha with metadata
tenzin3 Jul 8, 2024
d9461ef
merge with feat/write-pecha-annotation
tenzin3 Jul 8, 2024
7c253d2
set base and layer from class method from_path Pecha
tenzin3 Jul 8, 2024
ba5e4d0
refactor code
tenzin3 Jul 8, 2024
029e6ad
create test_pecha_read
tenzin3 Jul 8, 2024
72ef523
fix/set annotation id in layer classmethod from path
tenzin3 Jul 8, 2024
2f28966
Layer get_annotations
tenzin3 Jul 8, 2024
7839274
add/ annotation metadata in parse_annotation
tenzin3 Jul 8, 2024
fba97e2
write annotation metadata to stam if it exists
tenzin3 Jul 8, 2024
2ca71a5
read pecha metadata from_path
tenzin3 Jul 8, 2024
3a2e5af
modify path assignment
tenzin3 Jul 10, 2024
cfc2414
upload files to github repo
tenzin3 Jul 10, 2024
a03050c
delete unnecessary lines
tenzin3 Jul 10, 2024
32e10e6
Pecha classmethod from_id
tenzin3 Jul 10, 2024
f13cc29
setup a dummy GITHUB_TOKEN
tenzin3 Jul 10, 2024
961cef2
update CI
tenzin3 Jul 10, 2024
154514a
update CI
tenzin3 Jul 10, 2024
14 changes: 9 additions & 5 deletions .github/workflows/CI.yml
@@ -16,20 +16,24 @@ jobs:

steps:
- uses: actions/checkout@v3

- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: "3.8"

- name: Install dependencies
run: |
pip install -U pip
pip install .
pip install .[dev]

- name: Test with pytest
run: PYTHONPATH=src pytest

env:
GITHUB_TOKEN: " "

- name: Test Coverage
run: PYTHONPATH=src pytest --cov project_name
run: PYTHONPATH=src pytest --cov openpecha
env:
GITHUB_TOKEN: " "
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -20,7 +20,8 @@ classifiers = [
dependencies = [
"pydantic >= 2.7.4",
"stam == 0.8.2",

"collection >= 0.1.6",
"PyGithub >= 2.3.0",
]

[project.optional-dependencies]
67 changes: 30 additions & 37 deletions src/openpecha/alignment/parsers/plaintext.py
@@ -1,10 +1,10 @@
from pathlib import Path
from typing import Dict
from typing import List

from openpecha.ids import get_initial_pecha_id, get_uuid
from openpecha.pecha import Pecha
from openpecha.pecha.annotation import Annotation
from openpecha.pecha.layer import Layer, LayerEnum
from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata


class PlainTextLineAlignedParser:
@@ -19,55 +19,48 @@ def from_files(cls, source_path: Path, target_path: Path, metadata: dict):
target_text = target_path.read_text(encoding="utf-8")
return cls(source_text, target_text, metadata)

def create_pecha_layer(self, base_text: str, annotation: LayerEnum):
def create_pecha_layer(self, segments: List[str], annotation_type: LayerEnum):
""" """
layer_annotations: Dict[str, Annotation] = {}
layer = Layer(annotation_type=annotation_type)
char_count = 0
for segment in base_text.split("\n"):
layer_annotations[get_uuid()] = Annotation(
id_=get_uuid(),
segment=segment,
for segment in segments:
annotation = Annotation(
start=char_count,
end=char_count + len(segment),
)
layer.set_annotation(annotation)
char_count += len(segment)

return Layer(annotation_label=annotation, annotations=layer_annotations)
return layer

def parse(self):
source_pecha_id, target_pecha_id = (
get_initial_pecha_id(),
get_initial_pecha_id(),
source_pecha_metadata, target_pecha_metadata = (
InitialPechaMetadata(
initial_creation_type=InitialCreationType.input,
source_metadata=self.metadata["source"],
),
InitialPechaMetadata(
initial_creation_type=InitialCreationType.input,
source_metadata=self.metadata["target"],
),
)
source_pecha = Pecha(metadata=source_pecha_metadata)
target_pecha = Pecha(metadata=target_pecha_metadata)

source_base_fname, target_base_fname = get_uuid(), get_uuid()
source_base_files = {source_base_fname: self.source_text}
target_base_files = {target_base_fname: self.target_text}
source_base_name = source_pecha.set_base_file(self.source_text)
target_base_name = target_pecha.set_base_file(self.target_text)

source_annotation = LayerEnum(self.metadata["source"]["annotation_label"])
target_annotation = LayerEnum(self.metadata["target"]["annotation_label"])

source_layers = {
source_base_fname: {
source_annotation: self.create_pecha_layer(
self.source_text, source_annotation
)
}
}
target_layers = {
target_base_fname: {
target_annotation: self.create_pecha_layer(
self.target_text, target_annotation
),
}
}

source_pecha = Pecha( # noqa
source_pecha_id, source_base_files, source_layers, self.metadata["source"]
source_pecha.set_layer(
source_base_name,
LayerEnum.segment,
self.create_pecha_layer(self.source_text.split("\n"), LayerEnum.segment),
)
target_pecha = Pecha( # noqa
target_pecha_id, target_base_files, target_layers, self.metadata["target"]
target_pecha.set_layer(
target_base_name,
LayerEnum.segment,
self.create_pecha_layer(self.target_text.split("\n"), LayerEnum.segment),
)

return source_pecha, target_pecha

# TODO:
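For orientation, a minimal usage sketch of the reworked parser. The file names and metadata fields are illustrative assumptions, and importing the parser pulls in openpecha.pecha (and through it github_utils), so a GITHUB_TOKEN must be set in the environment — which is why the CI workflow above exports a dummy one.

```python
from pathlib import Path

from openpecha.alignment.parsers.plaintext import PlainTextLineAlignedParser

# Hypothetical input files: one segment per line, source and target line-aligned.
source_path = Path("source.txt")
target_path = Path("target.txt")

# parse() only reads the "source" and "target" keys; their values end up as
# source_metadata on each pecha's InitialPechaMetadata.
metadata = {
    "source": {"title": "example source text"},
    "target": {"title": "example target text"},
}

parser = PlainTextLineAlignedParser.from_files(source_path, target_path, metadata)
source_pecha, target_pecha = parser.parse()

source_pecha.write()  # defaults to PECHAS_PATH, i.e. ~/.pechadata/pechas
target_pecha.write()
```

Note that parse() no longer reads an annotation_label per pecha from the metadata; both texts are split on newlines and stored as LayerEnum.segment layers.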
2 changes: 2 additions & 0 deletions src/openpecha/config.py
@@ -9,6 +9,8 @@ def _mkdir(path):
return path


ORG_NAME = "PechaData"

BASE_PATH = _mkdir(Path.home() / ".pechadata")
PECHAS_PATH = _mkdir(BASE_PATH / "pechas")

59 changes: 59 additions & 0 deletions src/openpecha/github_utils.py
@@ -0,0 +1,59 @@
import os
import subprocess
from pathlib import Path
from shutil import rmtree

from github import Github, GithubException

from openpecha.config import ORG_NAME

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
raise Exception("GITHUB_TOKEN is not set in the environment.")


def create_github_repo(repo_name: str):
try:
g = Github(GITHUB_TOKEN)
org = g.get_organization(ORG_NAME)
org.create_repo(repo_name)

except GithubException as e:
raise GithubException(f"Error creating repo: {e}")


def upload_files_to_github_repo(repo_name: str, folder_path: Path):
try:
g = Github(GITHUB_TOKEN)
org = g.get_organization(ORG_NAME)
repo = org.get_repo(repo_name)

for file in folder_path.rglob("*"):
if file.is_dir():
continue
file_path = file.relative_to(folder_path)
with open(file) as f:
content = f.read()
repo.create_file(str(file_path), f"committing {file.name}", content)
except GithubException as e:
raise GithubException(f"Error uploading files to github: {e}")


def clone_github_repo(repo_name: str, destination_folder: Path):
repo_path = destination_folder / repo_name
if repo_path.exists():
rmtree(repo_path)
else:
try:
repo_url = f"https://github.com/{ORG_NAME}/{repo_name}.git"
env = {"GIT_ASKPASS": "echo", "GIT_PASSWORD": GITHUB_TOKEN}
subprocess.run(
["git", "clone", repo_url, str(repo_path)],
check=True,
capture_output=True,
env={k: str(v) for k, v in env.items()},
)
return repo_path
except subprocess.CalledProcessError as e:
print(f"Error cloning {repo_name} repository: {e}")
return None
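A short sketch of how these helpers compose. The repo name and local folder are hypothetical, and a GITHUB_TOKEN with access to the PechaData organization must be exported before the import, since the module raises otherwise.

```python
from pathlib import Path

# github_utils raises at import time when GITHUB_TOKEN is not set in the environment.
from openpecha.config import PECHAS_PATH
from openpecha.github_utils import (
    clone_github_repo,
    create_github_repo,
    upload_files_to_github_repo,
)

pecha_id = "I1234ABCD"                 # hypothetical repo / pecha name
pecha_dir = Path("pechas") / pecha_id  # hypothetical local folder holding the .opf layout

create_github_repo(pecha_id)                      # empty repo under the PechaData org
upload_files_to_github_repo(pecha_id, pecha_dir)  # one commit per file in the folder

# Fetch it back later; returns the local path, or None if the clone failed.
local_copy = clone_github_repo(pecha_id, PECHAS_PATH)
```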
4 changes: 4 additions & 0 deletions src/openpecha/ids.py
@@ -6,6 +6,10 @@ def get_uuid():
return uuid4().hex


def get_fourchar_uuid():
return get_uuid()[:4]


def get_id(prefix, length):
return prefix + "".join(random.choices(uuid4().hex, k=length)).upper()

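For context, a tiny sketch of the new helper; the Pecha methods below use the same get_uuid()[:4] truncation for their default base and layer names.

```python
from openpecha.ids import get_fourchar_uuid, get_uuid

base_name = get_fourchar_uuid()    # short hex id, e.g. "3f9a" (random)
layer_subtype_id = get_uuid()[:4]  # the equivalent truncation used by Pecha.set_layer
```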
146 changes: 109 additions & 37 deletions src/openpecha/pecha/__init__.py
@@ -1,64 +1,136 @@
import json
from collections import defaultdict
from pathlib import Path
from shutil import rmtree
from typing import Dict
from typing import Dict, Optional, Tuple

from stam import AnnotationStore, Offset, Selector

from openpecha.config import (
PECHA_ANNOTATION_STORE_ID,
PECHA_DATASET_ID,
PECHAS_PATH,
_mkdir,
)
from openpecha.config import PECHAS_PATH, _mkdir
from openpecha.github_utils import clone_github_repo
from openpecha.ids import get_uuid
from openpecha.pecha.annotation import Annotation
from openpecha.pecha.layer import Layer, LayerEnum
from openpecha.pecha.metadata import (
InitialCreationType,
PechaMetadata,
to_json_serializable,
)


class Pecha:
def __init__(
self,
pecha_id: str,
bases: Dict[str, str],
layers: Dict[str, Dict[LayerEnum, Layer]],
metadata: Dict[str, str],
pecha_id: str = None,
bases: Dict[str, str] = defaultdict(str),
layers: Dict[str, Dict[Tuple[LayerEnum, str], Layer]] = defaultdict(
lambda: defaultdict()
),
metadata: PechaMetadata = None,
) -> None:
self.pecha_id = pecha_id
self.pecha_id = metadata.id_ if metadata else pecha_id
self.bases = bases
self.layers = layers
self.metadata = metadata

@classmethod
def from_path(cls, path: str):
pass
def from_path(cls, pecha_path: Path):
pecha_id = pecha_path.stem
base_path = pecha_path / f"{pecha_id}.opf"
with open(base_path / "metadata.json", encoding="utf-8") as f:
metadata = json.load(f)
metadata = json.loads(metadata)

preprocessed_meta = preprocess_metadata(metadata)
pecha_metadata = PechaMetadata(**preprocessed_meta)
pecha = Pecha(metadata=pecha_metadata)
pecha.pecha_path = pecha_path

for base_file in (base_path / "base").rglob("*"):
base_text = base_file.read_text(encoding="utf-8")
pecha.set_base_file(base_text, base_file.stem)

for layer_dir in (base_path / "layers").iterdir():
for layer_file in layer_dir.glob("*.json"):
layer = Layer.from_path(layer_file)
pecha.set_layer(layer_dir.stem, layer.annotation_type, layer, layer.id_)

return pecha

@classmethod
def from_id(cls, pecha_id: str):
pass
repo_path = clone_github_repo(pecha_id, PECHAS_PATH)
return cls.from_path(repo_path)

def write(self, export_path: Path = PECHAS_PATH):
def set_base_file(self, base_text: str, base_file_name: str = None) -> str:
base_file_name = base_file_name if base_file_name else get_uuid()[:4]
self.bases[base_file_name] = base_text
return base_file_name

pecha_dir = _mkdir(export_path / self.pecha_id)
self.base_path = _mkdir(pecha_dir / f"{self.pecha_id}.opf")
def set_layer(
self,
base_name: str,
annotation_type: LayerEnum,
layer: Layer,
layer_subtype_id: str = None,
) -> str:

"""layer key is a tuple of layer label and layer id"""
""" A particular volume can have multiple layers with same label but different id"""
layer_subtype_id = get_uuid()[:4] if not layer_subtype_id else layer_subtype_id
self.layers[base_name][(annotation_type, layer_subtype_id)] = layer
return layer_subtype_id

def write(self, output_path: Path = PECHAS_PATH):
if not self.pecha_id:
raise ValueError("pecha_id must be set before writing.")

self.pecha_path = _mkdir(output_path / self.pecha_id)

self.base_path = _mkdir(self.pecha_path / f"{self.pecha_id}.opf")
""" write metadata """
self.metadata_fn = self.base_path / "metadata.json"
self.metadata_fn.write_text(
json.dumps(self.metadata, indent=4, ensure_ascii=False), encoding="utf-8"
json.dumps(
to_json_serializable(self.metadata), indent=4, ensure_ascii=False
),
encoding="utf-8",
)

""" write base file"""
base_dir = _mkdir(self.base_path / "base")
for base_fname, base_text in self.bases.items():
base_fn = base_dir / f"{base_fname}.txt"
base_fn.write_text(base_text, encoding="utf-8")

layer_dir = _mkdir(self.base_path / "layers")
""" write annotation layers"""
for layer_fname, layer_data in self.layers.items():
for _, layer in layer_data.items():
_mkdir(layer_dir / layer_fname)
layer.write(
base_file_path=base_dir / layer_fname,
export_path=layer_dir / layer_fname,
)
if self.bases:
base_dir = _mkdir(self.base_path / "base")
for base_name, base_text in self.bases.items():
base_fn = base_dir / f"{base_name}.txt"
base_fn.write_text(base_text, encoding="utf-8")
if self.layers:
layer_dir = _mkdir(self.base_path / "layers")
""" write annotation layers"""
for layer_name, layer_data in self.layers.items():
for _, layer in layer_data.items():
_mkdir(layer_dir / layer_name)
layer.write(
base_file_path=base_dir / f"{layer_name}.txt",
output_path=output_path,
)


def preprocess_metadata(metadata: Dict) -> Dict:
# Replace null values with default values
processed_metadata = {
"id_": metadata.get("id_", ""),
"title": metadata.get("title", []) if metadata.get("title") is not None else [],
"author": metadata.get("author", [])
if metadata.get("author") is not None
else [],
"source": metadata.get("source", "")
if metadata.get("source") is not None
else "",
"language": metadata.get("language", "")
if metadata.get("language") is not None
else "",
"initial_creation_type": InitialCreationType(metadata["initial_creation_type"])
if "initial_creation_type" in metadata
else None,
"created_at": metadata.get("created_at"),
"source_metadata": metadata.get("source_metadata", {})
if metadata.get("source_metadata") is not None
else {},
}
return processed_metadata
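Putting the new Pecha API together, a write/read round trip looks roughly like this. It is a sketch: it assumes InitialPechaMetadata assigns an id_ when none is supplied (as the "set pecha_id if not in metadata" commit suggests) and that GITHUB_TOKEN is set so the github_utils import succeeds.

```python
from openpecha.config import PECHAS_PATH
from openpecha.pecha import Pecha
from openpecha.pecha.annotation import Annotation
from openpecha.pecha.layer import Layer, LayerEnum
from openpecha.pecha.metadata import InitialCreationType, InitialPechaMetadata

base_text = "first segment\nsecond segment"

pecha = Pecha(
    metadata=InitialPechaMetadata(
        initial_creation_type=InitialCreationType.input,
        source_metadata={"title": "example"},
    )
)

# One base file plus one segment layer covering the first line.
base_name = pecha.set_base_file(base_text)
layer = Layer(annotation_type=LayerEnum.segment)
layer.set_annotation(Annotation(start=0, end=len("first segment")))
pecha.set_layer(base_name, LayerEnum.segment, layer)

# Writes <pecha_id>/<pecha_id>.opf/{metadata.json, base/, layers/} under PECHAS_PATH.
pecha.write()

# Read it back from disk; from_id(pecha_id) instead clones the repo of that
# name from the PechaData org into PECHAS_PATH and then reads the clone.
same_pecha = Pecha.from_path(PECHAS_PATH / pecha.pecha_id)
```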