In [2]:
import os
import os.path as path
from pprint import pprint
import re
import json

# NOTE: "__file___" is a quoted placeholder string, not the ``__file__``
# variable. Taking its absolute path resolves it against the current working
# directory, so HERE ends up being the directory the kernel was started in.
_placeholder_path = path.abspath("__file___")
HERE = path.split(_placeholder_path)[0]

# The project root is located two directory levels above HERE.
_two_levels_up = path.join(HERE, "../..")
PROJECT_DIR = path.abspath(_two_levels_up)

# Per-dataset experiment configuration, keyed by dataset name.
#
# Keys read by the generation cell further down in this notebook:
#   "exp-path"            - directory that contains ``cross_product.json``.
#   "exp-path-3"          - root under which ``sample/k<k>/exp<NNN>/`` directories
#                           and their ``<library>.rgx`` files are written.
#   "lookahead-metaregex" - list of dicts mapping a library tag ("rematch",
#                           "pcre", "re2") to a regex template. The first two
#                           "&" characters of a template are placeholders that
#                           get replaced by a pair of patterns taken from
#                           ``cross_product.json``.
#
# The remaining keys ("exp-path-2", "2-variable-metaregexes", "splitting-regex")
# are not read by any cell visible here.
# NOTE(review): presumably they are consumed by other cells/scripts and the "&"
# placeholders in "2-variable-metaregexes" follow the same convention — confirm.
# Not every dataset defines every key (e.g. "sparql" has no "exp-path-2",
# "dna" has no "splitting-regex").
DATASETS = {
    "dna": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/dna/1var"),
        "exp-path-2": path.join(PROJECT_DIR, "exp/crossings/dna/2vars"),
        "exp-path-3": path.join(PROJECT_DIR, "exp/crossings/dna/lookahead"),
        "2-variable-metaregexes": [
            {
                "rematch": R"!x{>NP[^\n]+}\n[A-Z\n]*!y{&.{0,20}&}[A-Z\n]*\n>",
                "re2": R"(>NP[^\n]+)\n[A-Z\n]*(&.{0,20}&)[A-Z\n]*\n",
            }
        ],
        # One template per library; each has exactly two "&" placeholders.
        "lookahead-metaregex": [
            {"rematch": R"!x{&.{0,20}&}", "pcre": R"(?=(&.{0,20}&)).", "re2": R"(&.{0,20}&)"}
        ],
    },
    "morphemes": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/morphemes/2grams"),
        "exp-path-2": path.join(PROJECT_DIR, "exp/crossings/morphemes/2vars"),
        "exp-path-3": path.join(PROJECT_DIR, "exp/crossings/morphemes/lookahead"),
        # Raw string: the doubled backslashes are literal, i.e. this regex
        # matches text that itself contains backslash escapes such as ``\W``.
        "splitting-regex": R"^\\W!x\{(.*?)\\W(.*?)\}\\W$",
        "2-variable-metaregexes": [
            {
                "rematch": R"(\n\n|\.\s*)!x{[A-Z][^.]* !y{& &}( [^.]*)?\.}",
                "re2": R"(?:\n\n|\.\s*)([A-Z][^.]* (& &)(?: [^.]*)?\.)",
            }
        ],
        # One template per library; each has exactly two "&" placeholders.
        "lookahead-metaregex": [{"rematch": R"\W!x{&\W&}\W","pcre": R".(?=\W(&\W&)\W)", "re2": R"\W(&\W&)\W"}],
    },
    "sparql": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/sparql/1-var/2lines"),
        "exp-path-3": path.join(PROJECT_DIR, "exp/crossings/sparql/1-var/lookahead"),
        # Raw string: the doubled backslashes are literal, i.e. this regex
        # matches text that itself contains escapes such as ``\n`` and ``[^\n]*``.
        "splitting-regex": R"^!x\{\\n(.*\[\^\\n\]\*)\\n(\[\^\\n\]\*.*)\\n\}$",
        # One template per library; each has exactly two "&" placeholders.
        "lookahead-metaregex": [{"rematch": R"\n!x{&\n&}\n","pcre": R"(?=(\n&\n&))\n", "re2": R"\n(&\n&)\n"}],
    },
}


def get_subdirs(parent_dir: str):
    """Lazily yield the path of every immediate subdirectory of ``parent_dir``.

    Args:
        parent_dir: Directory whose direct children are inspected.

    Yields:
        The ``os.DirEntry.path`` of each child for which ``is_dir()`` is true,
        in the arbitrary order reported by ``os.scandir``.

    Raises:
        OSError: Propagated from ``os.scandir`` on first iteration (e.g. when
            ``parent_dir`` does not exist or is not a directory).
    """
    # The context manager closes the underlying directory handle even when the
    # caller stops iterating early (generator closed / garbage-collected),
    # which otherwise leaves it open and triggers a ResourceWarning.
    with os.scandir(parent_dir) as entries:
        for entry in entries:
            if entry.is_dir():
                yield entry.path

In [3]:
# Dataset whose lookahead experiments are generated by this cell.
dataset = 'morphemes'


def _fill_metaregex(template, first, second):
    """Replace the first two '&' placeholders of ``template``.

    The template is split on its own '&' characters before anything is
    inserted, so an '&' occurring inside ``first`` can never be mistaken for
    the second placeholder. Both patterns are inserted verbatim. A template
    with fewer than two placeholders receives only as many patterns as it has
    placeholders.
    """
    pieces = template.split('&', 2)
    filled = [pieces[0]]
    for pattern, tail in zip((first, second), pieces[1:]):
        filled.append(pattern)
        filled.append(tail)
    return ''.join(filled)


def _link_to_pcre(exp_dir, alias):
    """Ensure ``exp_dir/alias`` exists, creating it as a symlink to ``exp_dir/pcre.rgx``.

    An existing file or valid link is left untouched. A dangling symlink is
    replaced: ``path.exists`` reports it as missing, so calling ``os.symlink``
    directly would raise ``FileExistsError``.
    """
    link = path.join(exp_dir, alias)
    if path.islink(link) and not path.exists(link):
        os.remove(link)
    if not path.lexists(link):
        os.symlink(path.join(exp_dir, "pcre.rgx"), link)


# cross_product.json is indexed by ``k``; each entry is a sequence of
# (pattern1, pattern2) pairs.
with open(path.join(DATASETS[dataset]['exp-path'], 'cross_product.json')) as fp:
    cross_product = json.load(fp)

for k in cross_product:
    path1 = path.join(DATASETS[dataset]['exp-path-3'], f'sample/k{k}')
    for i, (pattern1, pattern2) in enumerate(cross_product[k]):
        # One directory per pattern pair: .../sample/k<k>/exp<NNN>
        path2 = path.join(path1, f'exp{i:03}')
        os.makedirs(path2, exist_ok=True)
        for mregex in DATASETS[dataset]['lookahead-metaregex']:
            # One ``<library>.rgx`` file per library tag in the template dict.
            for libtype in mregex:
                res = _fill_metaregex(mregex[libtype], pattern1, pattern2)
                with open(path.join(path2, f"{libtype}.rgx"), 'w') as rgx_file:
                    rgx_file.write(res)

            # perl and boost reuse the pcre regex via symlinks.
            _link_to_pcre(path2, "perl.rgx")
            _link_to_pcre(path2, "boost.rgx")
