Merge f5c6c7f into fd4ea02

OpenSemanticLab · Jun 20, 2024 · a1d9981 · a1d9981
2 parents fd4ea02 + f5c6c7f
commit a1d9981
Show file tree

Hide file tree

Showing 9 changed files with 538 additions and 404 deletions.
diff --git a/examples/data_processing/regex_matching.py b/examples/data_processing/regex_matching.py
@@ -0,0 +1,67 @@
+# Example script: look up a pattern in REGEX_PATTERN_LIB and apply it to several
+# sample texts. Nothing is printed; the results are kept in module-level
+# variables so that they can be inspected from an interactive console.
+from osw.utils.regex_pattern import REGEX_PATTERN_LIB
+
+# Sample 1: wikitext of a 'Template:Viewer/Media' call. The 'textdata' parameter
+# lists file page titles ('File:OSW...'), each followed by '{{!}}' and a caption.
+# Note that the second entry carries no file suffix.
+test_string = """
+    {{Template:Viewer/Media
+    | image_size = 300
+    | mode = default
+    | textdata = File:OSW420279d1be9640ad96e6685277a3f29b.png{{!}}Navigation menu;
+    File:OSW0024d7c0a0d64642bf51f5facfe85d42{{!}}Search bar;
+    File:OSW841b61b5996340fa81c3163bcea87482.png{{!}}Three points menu;
+    File:OSWe8c6b659eab14cca927835ccd6baef15.png{{!}}Wiki preferences;
+    File:OSWc0df6b35cc964307bbf3d18c78e5cb3e.png{{!}}Alerts;
+    File:OSWd173625534d04fe6aab90a7bee4008e2.png{{!}}Notices;
+    File:OSWc34ced55461949a59f950283e905b5fc.drawio.png{{!}}Personal menu;
+    }}
+    """
+
+# Sample 2: JSON text (raw string) that contains one 'Category:' title and one
+# 'File:' title
+second_test_string = r"""
+{
+  "type": [
+    "Category:OSW92cc6b1a2e6b4bb7bad470dfdcfdaf26"
+  ],
+  "author": [],
+  "uuid": "d00e7453-f1f3-4a6e-94e6-3c2664fb7776",
+  "label": [
+    {
+      "text": "Full knowledge graph",
+      "lang": "en"
+    }
+  ],
+  "description": [
+    {
+      "text": "Large graph displaying the full knowledge base",
+      "lang": "en"
+    }
+  ],
+  "name": "FullKnowledgeGraph",
+  "image": "File:OSWc34ced55461949a59f950283e905b5fc.drawio.png"
+}
+"""
+
+# Sample 3: the same data as 'second_test_string', but as a Python dict
+my_dict = {
+    "type": ["Category:OSW92cc6b1a2e6b4bb7bad470dfdcfdaf26"],
+    "author": [],
+    "uuid": "d00e7453-f1f3-4a6e-94e6-3c2664fb7776",
+    "label": [{"text": "Full knowledge graph", "lang": "en"}],
+    "description": [
+        {"text": "Large graph displaying the full knowledge base", "lang": "en"}
+    ],
+    "name": "FullKnowledgeGraph",
+    "image": "File:OSWc34ced55461949a59f950283e905b5fc.drawio.png",
+}
+
+# str() of the dict, i.e. its Python repr rather than JSON. Not used further in
+# this script; available as an additional input to try the pattern on.
+my_dict_as_str = str(my_dict)
+
+
+# Test the regex pattern
+# Run this script in an interactive Python console (or paste the following lines
+# into one) and inspect the resulting variables
+# The pattern object is looked up by its description
+my_pattern = REGEX_PATTERN_LIB["File page strings from any text"]
+
+# NOTE(review): 'search' and 'findall' presumably mirror the functions of the same
+# name in the 're' module - confirm against the class of 'my_pattern'
+search_result = my_pattern.search(test_string)
+
+findall_result = my_pattern.findall(test_string)
+
+# NOTE(review): presumably returns, per match, the content of the group labeled
+# "Full page name" - confirm against the implementation of 'findall_by_group_key'
+full_page_names = my_pattern.findall_by_group_key(test_string, "Full page name")
+full_page_names_2 = my_pattern.findall_by_group_key(
+    second_test_string, "Full page name"
+)
diff --git a/scripts/migration/file_page_migration.py b/scripts/migration/file_page_migration.py
@@ -7,13 +7,13 @@
 from typing import Dict, List
 
 import mwclient
-from regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
 
 import osw.data.import_utility as iu
 import osw.wiki_tools as wt
 from osw.core import OSW
-from osw.data.mining import match_first_regex_pattern, test_regex_pattern
 from osw.model.entity import Label, WikiFile
+from osw.utils.regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
+from osw.utils.strings import match_first_regex_pattern, test_regex_pattern
 from osw.utils.util import parallelize
 from osw.wtsite import WtPage, WtSite
 

diff --git a/src/osw/controller/page_package.py b/src/osw/controller/page_package.py
@@ -11,10 +11,10 @@
 
 import osw.model.page_package as model
 from osw.auth import CredentialManager
-from osw.data.mining import RegExPatternExtended
 from osw.model import page_package as package
 from osw.model.page_package import NAMESPACE_CONST_TO_NAMESPACE_MAPPING
 from osw.model.static import OswBaseModel
+from osw.utils.strings import RegExPatternExtended
 from osw.wtsite import WtSite
 
 # Definition of constants

diff --git a/src/osw/data/import_utility.py b/src/osw/data/import_utility.py
@@ -10,72 +10,20 @@
 from geopy import Nominatim
 from jsonpath_ng import ext as jp
 
-import osw.data.mining as dm
+import osw.utils.strings as strutil
 from osw import wiki_tools as wt
 from osw.auth import CredentialManager
 from osw.core import OSW
-from osw.data.mining import RegExPatternExtended
 from osw.model import entity as model
+from osw.utils.regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
 from osw.wtsite import WtSite
 
 # Constants
 PACKAGE_ROOT_PATH = Path(__file__).parents[2]
 CREDENTIALS_FILE_PATH_DEFAULT = PACKAGE_ROOT_PATH / "examples" / "accounts.pwd.yaml"
 ENABLE_SORTING = True
-REGEX_PATTERN: Dict[str, Union[str, Dict[str, str]]] = {
-    "SAP OU number and name from DN": {
-        "Pattern": r"CN=(.+)([0-9]{10})-(.+),OU=Abteilungen",
-        "Groups": {2: "SAP OU number", 3: "SAP OU name"},
-    },
-    "Location name from DN": {
-        "Pattern": r"CN=[A-Za-z]+-(\d+)_L_([^_]+),OU=Standorte",
-        "Groups": {1: "SAP institute number", 2: "Location name"},
-    },
-    "Location/Site parts from DN": {
-        "Pattern": r"CN=[A-Za-z]+-(\d+)_L_(([^_^ ^-]+)-([^_^ ]+) (\d+)),OU=Standorte",
-        "Groups": {
-            1: "SAP institute number",
-            2: "Site name",
-            3: "City",
-            4: "Street",
-            5: "House number",
-        },
-    },
-    "UUID from full page title": {
-        "Pattern": r"([A-Za-z]+):([A-Z]+)([a-z\d\-]+)",
-        "Groups": {1: "Namespace", 2: "Prefix", 3: "UUID"},
-    },
-}
-REGEX_PATTERN_LIST = [
-    RegExPatternExtended(
-        description="SAP OU number and name from DN",
-        pattern=r"CN=(.+)([0-9]{10})-(.+),OU=Abteilungen",
-        group_keys=["Something", "SAP OU number", "SAP OU name"],
-    ),
-    RegExPatternExtended(
-        description="Location name from DN",
-        pattern=r"CN=[A-Za-z]+\-(\d+)_L_([^_]+),OU=Standorte",
-        group_keys=["SAP institute number", "Location name"],
-    ),
-    RegExPatternExtended(
-        description="Location/Site parts from DN",
-        pattern=r"CN=[A-Za-z]+\-(\d+)_L_(([^_^ ^-]+)-([^_^ ]+) (\d+))," r"OU=Standorte",
-        group_keys=[
-            "SAP institute number",
-            "Site name",
-            "City",
-            "Street",
-            "House number",
-        ],
-    ),
-    RegExPatternExtended(
-        description="UUID from full page title",
-        pattern=r"([A-Za-z]+):([A-Z]+)([a-z\d\-]+)",
-        group_keys=["Namespace", "Prefix", "UUID"],
-    ),
-]
+# Kept for compatibility with code that still reads REGEX_PATTERN from this module
 REGEX_PATTERN = {rep.description: rep.dict() for rep in REGEX_PATTERN_LIST}
-REGEX_PATTERN_LIB = {rep.description: rep for rep in REGEX_PATTERN_LIST}
 
 
 # Classes
@@ -203,7 +151,7 @@ def get_uuid_from_object_via_type(obj: Any) -> Union[uuid_module.UUID, None]:
         else:
             type_str = str(type_)
         match = re.match(
-            pattern=REGEX_PATTERN["UUID from full page title"]["Pattern"],
+            pattern=REGEX_PATTERN_LIB["UUID from full page title"].pattern,
             string=type_str,
         )
         uuid_str = match.group(3)
@@ -473,8 +421,8 @@ def nan_empty_or_none(inp: Any) -> bool:
 
 
 def regex_match_list(
-    pattern: Union[str, dm.RegExPatternExtended], list_of_strings: List[str]
-) -> List[Union[str, dm.MatchResult]]:
+    pattern: Union[str, strutil.RegExPatternExtended], list_of_strings: List[str]
+) -> List[Union[str, strutil.MatchResult]]:
     """Returns a subset of the 'list_of_strings' that matched the regex 'pattern'.
 
     Parameters
@@ -493,7 +441,7 @@ def regex_match_list(
             if re.match(pattern=pattern, string=string):
                 matches.append(string)
         return matches
-    elif isinstance(pattern, dm.RegExPatternExtended):
+    elif isinstance(pattern, strutil.RegExPatternExtended):
         matches = []
         for string in list_of_strings:
             match_result_obj = pattern.match(string)
@@ -780,6 +728,30 @@ def get_entities_from_osw(
     return entities_from_osw
 
 
+def full_page_title_to_uuid(full_page_title: str) -> uuid_module.UUID:
+    """Extracts a UUID from a full page title.
+
+    Parameters
+    ----------
+    full_page_title
+        The string the "UUID from full page title" pattern of REGEX_PATTERN_LIB is
+        matched against (anchored at the start of the string).
+
+    Returns
+    -------
+    The UUID built from the third capture group of the match.
+
+    Raises
+    ------
+    ValueError
+        If the title does not match the pattern or if the captured string is not a
+        valid UUID.
+    """
+    match = re.match(
+        pattern=REGEX_PATTERN_LIB["UUID from full page title"].pattern,
+        string=full_page_title,
+    )
+    # re.match returns None on a mismatch - fail with a clear message instead of an
+    # AttributeError raised by 'match.group'
+    if match is None:
+        raise ValueError(
+            f"Could not extract a UUID from the full page title '{full_page_title}'."
+        )
+    return uuid_module.UUID(match.group(3))
+
+
+def osw_id_to_uuid(osw_id: str) -> uuid_module.UUID:
+    """Extracts a UUID from an OSW ID.
+
+    Parameters
+    ----------
+    osw_id
+        The string the "UUID from OSW ID" pattern of REGEX_PATTERN_LIB is matched
+        against (anchored at the start of the string).
+
+    Returns
+    -------
+    The UUID built from the second capture group of the match.
+
+    Raises
+    ------
+    ValueError
+        If the OSW ID does not match the pattern or if the captured string is not
+        a valid UUID.
+    """
+    match = re.match(
+        pattern=REGEX_PATTERN_LIB["UUID from OSW ID"].pattern, string=osw_id
+    )
+    # re.match returns None on a mismatch - fail with a clear message instead of an
+    # AttributeError raised by 'match.group'
+    if match is None:
+        raise ValueError(f"Could not extract a UUID from the OSW ID '{osw_id}'.")
+    return uuid_module.UUID(match.group(2))
+
+
+def uuid_to_osw_id(uuid: uuid_module.UUID, prefix: str = "OSW") -> str:
+    """Builds an OSW ID from a UUID.
+
+    Parameters
+    ----------
+    uuid
+        The UUID to convert. Its string form is used with all hyphens removed.
+    prefix
+        The string placed in front of the hyphen-free UUID string.
+
+    Returns
+    -------
+    The prefix directly followed by the hyphen-free UUID string.
+    """
+    # Splitting at the hyphens and re-joining drops every hyphen
+    compact_uuid = "".join(str(uuid).split("-"))
+    return "{}{}".format(prefix, compact_uuid)
+
+
 def uuid_to_full_page_title(
     uuid: Union[uuid_module.UUID, str],
     wiki_ns: str = "Item",