Skip to content

Commit

Permalink
Merge f5c6c7f into fd4ea02
Browse files (browse the repository at this point in the history)
  • Loading branch information
LukasGold committed Jun 20, 2024
2 parents fd4ea02 + f5c6c7f commit a1d9981
Show file tree
Hide file tree
Showing 9 changed files with 538 additions and 404 deletions.
67 changes: 67 additions & 0 deletions examples/data_processing/regex_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from osw.utils.regex_pattern import REGEX_PATTERN_LIB

# Sample 1: wikitext of a "Template:Viewer/Media" call whose "textdata" parameter
# lists several "File:OSW..." page names, each followed by "{{!}}" and a caption.
# Note that the second entry ("Search bar") has no file extension, whereas the
# others end in ".png" or ".drawio.png".
test_string = """
{{Template:Viewer/Media
| image_size = 300
| mode = default
| textdata = File:OSW420279d1be9640ad96e6685277a3f29b.png{{!}}Navigation menu;
File:OSW0024d7c0a0d64642bf51f5facfe85d42{{!}}Search bar;
File:OSW841b61b5996340fa81c3163bcea87482.png{{!}}Three points menu;
File:OSWe8c6b659eab14cca927835ccd6baef15.png{{!}}Wiki preferences;
File:OSWc0df6b35cc964307bbf3d18c78e5cb3e.png{{!}}Alerts;
File:OSWd173625534d04fe6aab90a7bee4008e2.png{{!}}Notices;
File:OSWc34ced55461949a59f950283e905b5fc.drawio.png{{!}}Personal menu;
}}
"""

# Sample 2: JSON text (kept as a raw string) describing one entity. It contains a
# "Category:OSW..." reference under "type" and a single "File:OSW..." reference
# under "image".
second_test_string = r"""
{
"type": [
"Category:OSW92cc6b1a2e6b4bb7bad470dfdcfdaf26"
],
"author": [],
"uuid": "d00e7453-f1f3-4a6e-94e6-3c2664fb7776",
"label": [
{
"text": "Full knowledge graph",
"lang": "en"
}
],
"description": [
{
"text": "Large graph displaying the full knowledge base",
"lang": "en"
}
],
"name": "FullKnowledgeGraph",
"image": "File:OSWc34ced55461949a59f950283e905b5fc.drawio.png"
}
"""

# Sample 3: the same entity as in sample 2, written as a Python dictionary.
my_dict = {
"type": ["Category:OSW92cc6b1a2e6b4bb7bad470dfdcfdaf26"],
"author": [],
"uuid": "d00e7453-f1f3-4a6e-94e6-3c2664fb7776",
"label": [{"text": "Full knowledge graph", "lang": "en"}],
"description": [
{"text": "Large graph displaying the full knowledge base", "lang": "en"}
],
"name": "FullKnowledgeGraph",
"image": "File:OSWc34ced55461949a59f950283e905b5fc.drawio.png",
}

# str() yields the Python repr of the dictionary (single-quoted strings), which is
# not the same text as the JSON in sample 2.
my_dict_as_str = str(my_dict)


# Exercise the regex pattern on the samples defined above.
# Run this script in an interactive session (or paste it into a Python console)
# and inspect the variables assigned below.
my_pattern = REGEX_PATTERN_LIB["File page strings from any text"]

# NOTE(review): search() / findall() presumably mirror re.search / re.findall
# (first match vs. all matches) -- confirm against RegExPatternExtended.
search_result = my_pattern.search(test_string)

findall_result = my_pattern.findall(test_string)

# Collect the value of the "Full page name" group for each sample text.
full_page_names = my_pattern.findall_by_group_key(test_string, "Full page name")
full_page_names_2 = my_pattern.findall_by_group_key(
    second_test_string, "Full page name"
)
# The stringified dictionary was previously built but never fed to the pattern.
full_page_names_3 = my_pattern.findall_by_group_key(my_dict_as_str, "Full page name")
4 changes: 2 additions & 2 deletions scripts/migration/file_page_migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
from typing import Dict, List

import mwclient
from regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST

import osw.data.import_utility as iu
import osw.wiki_tools as wt
from osw.core import OSW
from osw.data.mining import match_first_regex_pattern, test_regex_pattern
from osw.model.entity import Label, WikiFile
from osw.utils.regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
from osw.utils.strings import match_first_regex_pattern, test_regex_pattern
from osw.utils.util import parallelize
from osw.wtsite import WtPage, WtSite

Expand Down
2 changes: 1 addition & 1 deletion src/osw/controller/page_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@

import osw.model.page_package as model
from osw.auth import CredentialManager
from osw.data.mining import RegExPatternExtended
from osw.model import page_package as package
from osw.model.page_package import NAMESPACE_CONST_TO_NAMESPACE_MAPPING
from osw.model.static import OswBaseModel
from osw.utils.strings import RegExPatternExtended
from osw.wtsite import WtSite

# Definition of constants
Expand Down
90 changes: 31 additions & 59 deletions src/osw/data/import_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,72 +10,20 @@
from geopy import Nominatim
from jsonpath_ng import ext as jp

import osw.data.mining as dm
import osw.utils.strings as strutil
from osw import wiki_tools as wt
from osw.auth import CredentialManager
from osw.core import OSW
from osw.data.mining import RegExPatternExtended
from osw.model import entity as model
from osw.utils.regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
from osw.wtsite import WtSite

# Constants
# Directory two levels above the directory that contains this file.
# NOTE(review): for a file located at src/osw/data/import_utility.py this resolves
# to "src", not to the repository root that holds "examples/" -- confirm that the
# default credentials path below points where it is meant to.
PACKAGE_ROOT_PATH = Path(__file__).parents[2]
# Default location of the credentials file, relative to PACKAGE_ROOT_PATH.
CREDENTIALS_FILE_PATH_DEFAULT = PACKAGE_ROOT_PATH / "examples" / "accounts.pwd.yaml"
# Module-level switch; the code that reads it is outside this view.
ENABLE_SORTING = True
# Legacy dictionary form of the pattern library: description -> {"Pattern", "Groups"},
# where "Groups" maps a capture-group number to a human-readable key.
# NOTE(review): the annotation does not describe the literal below (the values are
# dicts whose "Groups" entry has int keys) -- confirm before relying on it.
# NOTE(review): in this view REGEX_PATTERN is assigned again further down from
# REGEX_PATTERN_LIST, which would supersede this literal -- confirm which one is live.
REGEX_PATTERN: Dict[str, Union[str, Dict[str, str]]] = {
"SAP OU number and name from DN": {
"Pattern": r"CN=(.+)([0-9]{10})-(.+),OU=Abteilungen",
"Groups": {2: "SAP OU number", 3: "SAP OU name"},
},
"Location name from DN": {
"Pattern": r"CN=[A-Za-z]+-(\d+)_L_([^_]+),OU=Standorte",
"Groups": {1: "SAP institute number", 2: "Location name"},
},
"Location/Site parts from DN": {
# Inside a character class only a leading "^" negates; the later "^" characters
# are literal, so "[^_^ ^-]" excludes "_", "^", " " and "-".
"Pattern": r"CN=[A-Za-z]+-(\d+)_L_(([^_^ ^-]+)-([^_^ ]+) (\d+)),OU=Standorte",
"Groups": {
1: "SAP institute number",
2: "Site name",
3: "City",
4: "Street",
5: "House number",
},
},
"UUID from full page title": {
"Pattern": r"([A-Za-z]+):([A-Z]+)([a-z\d\-]+)",
"Groups": {1: "Namespace", 2: "Prefix", 3: "UUID"},
},
}
# Object form of the same four patterns. Unlike the "Groups" mappings above, every
# capture group gets a key here, listed in group order (hence the "Something"
# placeholder for the first group of the first pattern).
REGEX_PATTERN_LIST = [
RegExPatternExtended(
description="SAP OU number and name from DN",
pattern=r"CN=(.+)([0-9]{10})-(.+),OU=Abteilungen",
group_keys=["Something", "SAP OU number", "SAP OU name"],
),
RegExPatternExtended(
description="Location name from DN",
pattern=r"CN=[A-Za-z]+\-(\d+)_L_([^_]+),OU=Standorte",
group_keys=["SAP institute number", "Location name"],
),
RegExPatternExtended(
description="Location/Site parts from DN",
# The two adjacent raw string literals are concatenated into a single pattern.
pattern=r"CN=[A-Za-z]+\-(\d+)_L_(([^_^ ^-]+)-([^_^ ]+) (\d+))," r"OU=Standorte",
group_keys=[
"SAP institute number",
"Site name",
"City",
"Street",
"House number",
],
),
RegExPatternExtended(
description="UUID from full page title",
pattern=r"([A-Za-z]+):([A-Z]+)([a-z\d\-]+)",
group_keys=["Namespace", "Prefix", "UUID"],
),
]
# For compatibility with the old version of the module
# Both views are keyed by each pattern's description: REGEX_PATTERN holds the result
# of rep.dict(), REGEX_PATTERN_LIB holds the pattern objects themselves.
REGEX_PATTERN = {rep.description: rep.dict() for rep in REGEX_PATTERN_LIST}
REGEX_PATTERN_LIB = {rep.description: rep for rep in REGEX_PATTERN_LIST}


# Classes
Expand Down Expand Up @@ -203,7 +151,7 @@ def get_uuid_from_object_via_type(obj: Any) -> Union[uuid_module.UUID, None]:
else:
type_str = str(type_)
match = re.match(
pattern=REGEX_PATTERN["UUID from full page title"]["Pattern"],
pattern=REGEX_PATTERN_LIB["UUID from full page title"].pattern,
string=type_str,
)
uuid_str = match.group(3)
Expand Down Expand Up @@ -473,8 +421,8 @@ def nan_empty_or_none(inp: Any) -> bool:


def regex_match_list(
pattern: Union[str, dm.RegExPatternExtended], list_of_strings: List[str]
) -> List[Union[str, dm.MatchResult]]:
pattern: Union[str, strutil.RegExPatternExtended], list_of_strings: List[str]
) -> List[Union[str, strutil.MatchResult]]:
"""Returns a subset of the 'list_of_strings' that matched the regex 'pattern'.
Parameters
Expand All @@ -493,7 +441,7 @@ def regex_match_list(
if re.match(pattern=pattern, string=string):
matches.append(string)
return matches
elif isinstance(pattern, dm.RegExPatternExtended):
elif isinstance(pattern, strutil.RegExPatternExtended):
matches = []
for string in list_of_strings:
match_result_obj = pattern.match(string)
Expand Down Expand Up @@ -780,6 +728,30 @@ def get_entities_from_osw(
return entities_from_osw


def full_page_title_to_uuid(full_page_title: str) -> uuid_module.UUID:
    """Extracts a UUID from a full page title.

    The title is matched, anchored at its start, against the
    "UUID from full page title" entry of REGEX_PATTERN_LIB, and the third capture
    group is parsed as a UUID.

    Parameters
    ----------
    full_page_title:
        The title to parse. Presumably of the form "<Namespace>:<Prefix><uuid>",
        e.g. "Item:OSW<32 hex digits>" -- the exact shape is defined by the
        pattern, which lives outside this function.

    Returns
    -------
    The UUID found in the title.

    Raises
    ------
    ValueError
        If the title does not match the pattern, or if the captured text is not a
        valid UUID.
    """
    match = re.match(
        pattern=REGEX_PATTERN_LIB["UUID from full page title"].pattern,
        string=full_page_title,
    )
    # re.match returns None on a mismatch; fail with a descriptive error instead
    # of an AttributeError raised by calling .group() on None.
    if match is None:
        raise ValueError(
            f"Could not extract a UUID from full page title {full_page_title!r}."
        )
    return uuid_module.UUID(match.group(3))


def osw_id_to_uuid(osw_id: str) -> uuid_module.UUID:
    """Extracts a UUID from an OSW ID.

    The ID is matched, anchored at its start, against the "UUID from OSW ID" entry
    of REGEX_PATTERN_LIB, and the second capture group is parsed as a UUID.

    Parameters
    ----------
    osw_id:
        The OSW ID to parse. Presumably a prefix such as "OSW" followed by the
        UUID's hex digits -- the exact shape is defined by the pattern, which
        lives outside this function.

    Returns
    -------
    The UUID found in the OSW ID.

    Raises
    ------
    ValueError
        If the ID does not match the pattern, or if the captured text is not a
        valid UUID.
    """
    match = re.match(
        pattern=REGEX_PATTERN_LIB["UUID from OSW ID"].pattern, string=osw_id
    )
    # re.match returns None on a mismatch; fail with a descriptive error instead
    # of an AttributeError raised by calling .group() on None.
    if match is None:
        raise ValueError(f"Could not extract a UUID from OSW ID {osw_id!r}.")
    return uuid_module.UUID(match.group(2))


def uuid_to_osw_id(uuid: uuid_module.UUID, prefix: str = "OSW") -> str:
    """Creates an OSW ID from a UUID.

    Parameters
    ----------
    uuid:
        The UUID to convert. Its string form is used with all hyphens removed.
    prefix:
        Text placed in front of the hyphen-free UUID.

    Returns
    -------
    The prefix immediately followed by the UUID's string form without hyphens.
    """
    hyphen_free = "".join(str(uuid).split("-"))
    return "{}{}".format(prefix, hyphen_free)


def uuid_to_full_page_title(
uuid: Union[uuid_module.UUID, str],
wiki_ns: str = "Item",
Expand Down
Loading

0 comments on commit a1d9981

Please sign in to comment.