In [1]:
import os
import os.path
import re
from typing import Tuple, Optional

# NOTE: "__file__" is a quoted string here, i.e. a plain relative path rather
# than the module-level __file__ variable. abspath() resolves it against the
# current working directory, so HERE is the directory the kernel was started in.
HERE = os.path.dirname(os.path.abspath("__file__"))
# Project root: two directory levels above HERE.
PROJECT_DIR = os.path.abspath(os.path.join(HERE, "../.."))

In [17]:
# Directory tree that is walked for the input *.rgx files.
DNA_SAMPLE_DIR = os.path.join(PROJECT_DIR, 'benchmark/crossings/dna/small-sample')
# Directory under which the rewritten rematch.rgx files are written.
DNA_OUTPUT_DIR = os.path.join(PROJECT_DIR, 'benchmark/crossings/dna/small-sample-new')

In [19]:
def parse_regex(rgx: str) -> list:
  """Split a simple regex into its atoms.

  An atom is a single uppercase letter, a character class of uppercase
  letters (optionally negated with ``^``) or the wildcard ``.``, followed by
  an optional ``{n}`` / ``{n,m}`` repetition. Text that does not fit this
  grammar is skipped by the scan.

  Returns a list with one tuple per atom, in order of appearance:

    (text, (root, single, (neg, ccls), wcard), (first, second))

  where ``text`` is the full matched atom, ``root`` is the atom without its
  repetition, ``single`` / ``ccls`` / ``wcard`` is whichever alternative
  matched (the others are None), ``neg`` is ``"^"`` for a negated class, and
  ``first`` / ``second`` are the repetition bounds as strings (or None).
  """
  # Raw string: the pattern is full of regex escapes (\[ \. \{ \d) that are
  # invalid *string* escapes and trigger a SyntaxWarning on recent Pythons.
  atom_pattern = r"(([A-Z])|\[(\^)?([A-Z]+)\]|(\.))(?:\{(\d+)(?:,(\d+))?\})?"
  return [
    (
      m.group(0),
      (m.group(1), m.group(2), (m.group(3), m.group(4)), m.group(5)),
      (m.group(6), m.group(7)),
    )
    for m in re.finditer(atom_pattern, rgx)
  ]

def atom_complement(atom, parens=True) -> str:
  """Build a pattern for the complement of one parsed atom.

  ``atom`` is a tuple shaped like the entries produced by ``parse_regex``:
  ``(text, (root, single, (neg, ccls), wcard), (first, second))``.

  A single letter becomes a negated class, a character class has its
  negation flipped, and anything else (the wildcard) yields ``""``.
  When the atom has a repetition count and is not a wildcard, the result is
  an alternation of "``k`` copies of the root followed by the complement" for
  ``k`` in ``0 .. first-1``, wrapped in a group: non-capturing ``(?:...)``
  when ``parens`` is true, capturing ``(...)`` otherwise.
  """
  _, (base, literal, (negated, members), wildcard), (min_count, _) = atom

  if literal:
    negation = f"[^{literal}]"
  elif members:
    negation = f"[{members}]" if negated else f"[^{members}]"
  else:
    negation = ""

  # No repetition, or a wildcard: the bare complement is the answer.
  if not min_count or wildcard:
    return negation

  # One alternative per number of leading copies of the root that still match.
  alternatives = [negation]
  alternatives.extend(f"{base*k}{negation}" for k in range(1, int(min_count)))
  opener = "(?:" if parens else "("
  return opener + "|".join(alternatives) + ")"

def complement(rgx: str, parens=True) -> str:
  """Build an alternation of "matched prefix + first mismatching atom".

  The regex is split with ``parse_regex``; for every non-wildcard atom one
  alternative is produced, consisting of the text of all preceding atoms
  followed by ``atom_complement`` of the current atom. Wildcard atoms add no
  alternative but still extend the prefix. ``parens`` is forwarded to
  ``atom_complement``. The alternatives are joined with ``|``.
  """
  alternatives = []
  prefix = ""
  for parsed in parse_regex(rgx):
    text, root, _ = parsed
    is_wildcard = root[3]
    if not is_wildcard:
      alternatives.append(prefix + atom_complement(parsed, parens))
    prefix += text
  return "|".join(alternatives)

In [20]:
def parse_perl(rgx: str) -> str:
  """Placeholder for converting a perl/boost regex.

  TODO: not implemented yet; the body is empty, so the call currently
  returns None regardless of ``rgx``.
  """
  pass

def parse_rematch(rgx: str) -> Optional[str]:
  """Rewrite a ``!x{PREFIX.{0,100}SUFFIX}`` pattern without the bounded gap.

  The ``.{0,100}`` gap is replaced by ``(C)*`` where ``C`` is
  ``complement(SUFFIX, False)``, giving ``!x{PREFIX(C)*SUFFIX}``. Because the
  prefix group is greedy, the split happens at the last ``.{0,100}`` that
  still leaves a non-empty suffix.

  Returns None when ``rgx`` is not wrapped in ``!x{...}`` or contains no
  ``.{0,100}`` gap with text on both sides. The whole string must match, so
  surrounding whitespace (e.g. a trailing newline) also yields None.
  """
  # Raw strings: \{ and \. are regex escapes, not valid string escapes.
  wrapper = re.fullmatch(r"!x\{(.+)\}", rgx)
  if wrapper is None:
    return None

  gap = re.fullmatch(r"(.+)\.\{0,100\}(.+)", wrapper.group(1))
  if gap is None:
    return None

  prefix, suffix = gap.groups()
  return f"!x{{{prefix}({complement(suffix, False)})*{suffix}}}"

In [26]:
# Walk every sample directory and write a converted copy of each eligible
# rematch.rgx under DNA_OUTPUT_DIR.
for root, dirs, files in os.walk(DNA_SAMPLE_DIR):
  # Mirror the sample's path relative to DNA_SAMPLE_DIR. Numbering the output
  # by the file's position inside its own directory gives (nearly) the same
  # number for every sample, so results would overwrite one another.
  out_dir = os.path.join(DNA_OUTPUT_DIR, os.path.relpath(root, DNA_SAMPLE_DIR))
  for f in files:
    if f in ["perl.rgx", "boost.rgx"]:
      with open(os.path.join(root, f)) as fp:
        # parse_perl is still a stub; the result is not used yet.
        res = parse_perl(fp.read())
    elif f == "rematch.rgx":
      with open(os.path.join(root, f)) as fp:
        # Strip surrounding whitespace: parse_rematch uses re.fullmatch, which
        # a trailing newline would cause to fail.
        rgx = fp.read().strip()
      # Patterns containing '?' are not handled by the rewrite.
      if '?' in rgx:
        continue
      converted = parse_rematch(rgx)
      if converted is None:
        # Unsupported shape: report it instead of crashing in write(None).
        print(f"skipping {os.path.join(root, f)}: pattern not supported by parse_rematch")
        continue
      os.makedirs(out_dir, exist_ok=True)
      with open(os.path.join(out_dir, "rematch.rgx"), 'w') as wp:
        wp.write(converted)