Skip to content

Commit

Permalink
feat: add parallel load_entity
Browse files Browse the repository at this point in the history
  • Loading branch information
simontaurus committed Oct 29, 2023
1 parent 82b1830 commit 7edd4ed
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 43 deletions.
33 changes: 26 additions & 7 deletions examples/load_entity.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os

import osw.model.entity as model
from osw.auth import CredentialManager
from osw.core import OSW
from osw.wtsite import WtSite

Expand All @@ -9,18 +10,36 @@
os.path.dirname(os.path.abspath(__file__)), "accounts.pwd.yaml"
)
# pwd_file_path = "./accounts.pwd.yaml"
wtsite = WtSite.from_domain("wiki-dev.open-semantic-lab.org", pwd_file_path)
wtsite = WtSite(
config=WtSite.WtSiteConfig(
iri="wiki-dev.open-semantic-lab.org",
cred_mngr=CredentialManager(cred_filepath=pwd_file_path),
)
)
osw = OSW(site=wtsite)

title = "Item:OSW7d7193567ea14e4e89b74de88983b718"
# title = "Item:OSWe02213b6c4664d04834355dc8eb08b99"
# Load Tutorial Schema on demand
if not hasattr(model, "Tutorial"):
osw.fetch_schema(
OSW.FetchSchemaParam(
schema_title="Category:OSW494f660e6a714a1a9681c517bbb975da", mode="replace"
)
)

# load instance HowToCreateANewArticle
title = "Item:OSW52c2c5a6bbc84fcb8eab0fa69857e7dc"
entity = osw.load_entity(title)
print(entity.__class__)
print(entity.label[0].text) # we can access any attribute of model.Entity

hardware_entity = entity.cast(model.Hardware) # explicit cast to model.Hardware
print(hardware_entity.manufacturer) # we can now access any attribute of model.Hardware
tutorial_entity = entity.cast(model.Tutorial) # explicit cast to model.Tutorial
print(
tutorial_entity.required_predecessor
) # we can now access any attribute of model.Tutorial

print(entity.manufacturer)
print(entity.json(exclude_none=True)) # export as json

print(entity.json(exclude_none=True))
# load all instances of Tutorial in parallel
tutorials = osw.query_instances(category="Category:OSW494f660e6a714a1a9681c517bbb975da")
print(tutorials)
osw.load_entity(tutorials)
113 changes: 81 additions & 32 deletions src/osw/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,31 @@ def _fetch_schema(self, fetchSchemaParam: _FetchSchemaParam = None) -> None:
if not site_cache_state:
self.site.disable_cache() # restore original state

def load_entity(self, entity_title) -> model.Entity:
class LoadEntityParam(BaseModel):
    """Input parameters accepted by load_entity().

    Attributes
    ----------
    titles:
        a single wiki page title, or a list of wiki page titles,
        naming the entities to load
    """

    titles: Union[str, List[str]]

class LoadEntityResult(BaseModel):
    """Output container returned by load_entity().

    Attributes
    ----------
    entities:
        the loaded dataclass instance, or a list of such instances
    """

    entities: Union[model.Entity, List[model.Entity]]

def load_entity(
self, entity_title: Union[str, List[str], LoadEntityParam]
) -> Union[model.Entity, List[model.Entity], LoadEntityResult]:
"""Loads the entity with the given wiki page name from the OSW instance.
Creates an instance of the class specified by the "type" attribute, default
model.Entity. An instance of model.Entity can be cast to any subclass with
Expand All @@ -569,42 +593,67 @@ def load_entity(self, entity_title) -> model.Entity:
Returns
-------
the dataclass instance
the dataclass instance if only a single title is given
a list of dataclass instances if a list of titles is given
a LoadEntityResult instance if a LoadEntityParam is given
"""
entity = None
schemas = []
page = self.site.get_page(WtSite.GetPageParam(titles=[entity_title])).pages[0]
jsondata = page.get_slot_content("jsondata")
if jsondata:
for category in jsondata["type"]:
schema = (
self.site.get_page(WtSite.GetPageParam(titles=[category]))
.pages[0]
.get_slot_content("jsonschema")
)
schemas.append(schema)

if len(schemas) == 0:
print("Error: no schema defined")
titles = []
if isinstance(entity_title, str): # single title
titles = [entity_title]
if isinstance(entity_title, list): # list of titles
titles = entity_title
if isinstance(entity_title, OSW.LoadEntityParam): # LoadEntityParam
titles = entity_title.titles
entities = []

# store original cache state
cache_state = self.site.get_cache_enabled()
# enable cache to speed up loading
self.site.enable_cache()
pages = self.site.get_page(WtSite.GetPageParam(titles=titles)).pages
for page in pages:
entity = None
schemas = []
jsondata = page.get_slot_content("jsondata")
if jsondata:
for category in jsondata["type"]:
schema = (
self.site.get_page(WtSite.GetPageParam(titles=[category]))
.pages[0]
.get_slot_content("jsonschema")
)
schemas.append(schema)

elif len(schemas) == 1:
cls = schemas[0]["title"]
entity = eval(f"model.{cls}(**jsondata)")
if len(schemas) == 0:
print("Error: no schema defined")

else:
bases = []
for schema in schemas:
bases.append(eval("model." + schema["title"]))
cls = create_model("Test", __base__=tuple(bases))
entity = cls(**jsondata)
entity.meta = model.Meta(
wiki_page=model.WikiPage(
namespace=namespace_from_full_title(entity_title),
title=title_from_full_title(entity_title),
)
)
elif len(schemas) == 1:
cls = schemas[0]["title"]
entity = eval(f"model.{cls}(**jsondata)")

return entity
else:
bases = []
for schema in schemas:
bases.append(eval("model." + schema["title"]))
cls = create_model("Test", __base__=tuple(bases))
entity = cls(**jsondata)
entity.meta = model.Meta(
wiki_page=model.WikiPage(
namespace=namespace_from_full_title(entity_title),
title=title_from_full_title(entity_title),
)
)
entities.append(entity)
# restore original cache state
if not cache_state:
self.site.disable_cache()
if isinstance(entity_title, str): # single title
return entities[0]
if isinstance(entity_title, list): # list of titles
return entities
if isinstance(entity_title, OSW.LoadEntityParam): # LoadEntityParam
return OSW.LoadEntityResult(entities=entities)

class StoreEntityParam(model.OswBaseModel):
entities: Union[OswBaseModel, List[OswBaseModel]]
Expand Down
10 changes: 6 additions & 4 deletions src/osw/wtsite.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,18 +216,20 @@ def get_page(self, param: GetPageParam) -> GetPageResult:
exeptions = []
pages = []

def get_page_(title, index):
def get_page_(title: str, index: int = None):
retry = 0
wtpage = None
while retry < param.retries:
msg = ""
if index is not None:
msg = f"({index + 1}/{max_index}) "
try:
if self._cache_enabled and title in self._page_cache:
wtpage = self._page_cache[title]
msg += f"({index + 1}/{max_index}): Page loaded from cache."
msg += "Page loaded from cache. "
else:
wtpage = WtPage(self, title)
msg += f"({index + 1}/{max_index}): Page loaded"
msg += "Page loaded. "
if self._cache_enabled:
self._page_cache[title] = wtpage
pages.append(wtpage)
Expand All @@ -237,7 +239,7 @@ def get_page_(title, index):
msg += str(e)
if retry < param.retries:
retry += 1
msg = f"({index + 1}/{max_index}): Page load failed. Retry ({retry}/{param.retries})"
msg = f"Page load failed. Retry ({retry}/{param.retries}). "
sleep(5)
print(msg)
self._clear_cookies()
Expand Down

0 comments on commit 7edd4ed

Please sign in to comment.