In [1]:
import os
import os.path

# Root of the REmatch checkout. Every dataset and experiment path in this
# notebook is derived from it. Set the REMATCH_PROJECT_DIR environment variable
# to run the notebook from another checkout; the original location is the default.
PROJECT_DIR = os.environ.get('REMATCH_PROJECT_DIR',
                             '/home/nicovsj/repos/research/REmatch-org/REmatch')
# Documents that each experiment directory links to as its doc.txt.
ENGLISH = os.path.join(PROJECT_DIR, 'datasets/english-works/english-authors.txt')
SPARQL = os.path.join(PROJECT_DIR, 'datasets/RKBExplorer/sparql.log.1')

Define a helper that reads a file and returns the regexes it contains, one per line, as a list.

In [2]:
def get_regexes(path):
  """Read line-separated regexes from a text file.

  Each line is stripped of surrounding whitespace. Blank (or whitespace-only)
  lines are skipped, so a trailing newline or a spacer line in the file does
  not turn into an empty pattern.

  Args:
    path: Path of the file to read.

  Returns:
    A list with one string per non-blank line, in file order.
  """
  with open(path) as fp:
    stripped_lines = (line.strip() for line in fp)
    return [rgx for rgx in stripped_lines if rgx]

Now load every morpheme list defined in the `datasets/morphemes` directory.

In [42]:
# One morpheme list per category, read in a fixed order so that each name
# below is bound to the file of the same name.
suff, pref, greek, latin = [
  get_regexes(os.path.join(PROJECT_DIR, rel_path))
  for rel_path in (
    'datasets/morphemes/suff.txt',
    'datasets/morphemes/pref.txt',
    'datasets/morphemes/greek.txt',
    'datasets/morphemes/latin.txt',
  )
]

In [43]:
def _morpheme_patterns(morphemes, before='', after=''):
  """Build the pattern pair for every morpheme in a category.

  Args:
    morphemes: Iterable of morpheme strings; each is inserted into the
      patterns verbatim (no escaping).
    before: Regex fragment placed immediately before the morpheme.
    after: Regex fragment placed immediately after the morpheme.

  Returns:
    A list of dicts, one per morpheme, with keys 'rem' (the word wrapped in
    `!x{...}`) and 'perl' (the same word wrapped in a capture group), both
    delimited by `\\W` on each side.
  """
  # Raw f-strings keep `\W` / `\w` as literal backslash sequences without
  # relying on Python tolerating invalid string escapes.
  return [
    {'rem': rf'\W!x{{{before}{w}{after}}}\W',
     'perl': rf'\W({before}{w}{after})\W'}
    for w in morphemes
  ]


# Lists (not one-shot `map` iterators) so the cell that writes the experiment
# files can be re-run without first re-running this one.
regexps = {
  'suff': _morpheme_patterns(suff, before=r'\w+'),                 # word ends with the morpheme
  'pref': _morpheme_patterns(pref, after=r'\w+'),                  # word starts with the morpheme
  'greek': _morpheme_patterns(greek, before=r'\w+', after=r'\w+'),  # morpheme inside the word
  'latin': _morpheme_patterns(latin, before=r'\w+', after=r'\w+'),  # morpheme inside the word
}

In [44]:
# Materialise one experiment directory per morpheme pattern:
#   <PROJECT_DIR>/exp/morphemes/<category>/expNN/
# Each directory gets a doc.txt symlink to the English document plus the three
# regex files written below.
for key, value in regexps.items():
  categ_path = os.path.join(PROJECT_DIR, f'exp/morphemes/{key}')
  os.makedirs(categ_path, exist_ok=True)
  for i, rgxs in enumerate(value):
    exp_path = os.path.join(categ_path, f'exp{i:02}')
    os.makedirs(exp_path, exist_ok=True)

    doc_path = os.path.join(exp_path, 'doc.txt')
    # lexists() is also true for a dangling symlink (exists() follows the link
    # and reports False), so a stale link left by a previous run is removed
    # instead of making os.symlink() raise FileExistsError.
    if os.path.lexists(doc_path):
      os.remove(doc_path)
    os.symlink(ENGLISH, doc_path)

    # boost.rgx receives the same pattern text as perl.rgx.
    outputs = (
      ('rematch.rgx', rgxs['rem']),
      ('perl.rgx', rgxs['perl']),
      ('boost.rgx', rgxs['perl']),
    )
    for filename, pattern in outputs:
      with open(os.path.join(exp_path, filename), 'w') as fp:
        fp.write(pattern)

In [4]:
# Operators used to build the SPARQL patterns, one per line of operators.txt.
operators_path = os.path.join(PROJECT_DIR, 'datasets/sparql/operators.txt')
ops = get_regexes(operators_path)

In [5]:
# Ordered operator pairs: the patterns built from them match a line in which
# the first operator appears before the second, each surrounded by spaces.
# 'CONSTUCT' and 'DISTICT' were misspelled in three entries and have been
# corrected to the actual keywords so those patterns can match.
double_ops = (('OPTIONAL',  'OPTIONAL'),
              ('OPTIONAL',  'FILTER'),
              ('OPTIONAL',  'LIMIT'),
              ('OPTIONAL',  'ASK'),
              ('CONSTRUCT', 'AS'),
              ('CONSTRUCT', 'GROUP BY'),
              ('OPTIONAL',  'GROUP BY'),
              ('AS',        'FILTER'),
              ('AS',        'WHERE'),
              ('OPTIONAL',  'ORDER BY'),
              ('AS',        'SELECT'),
              ('AS',        'LIMIT'),
              ('AS',        'AS'),
              ('AS',        'GROUP BY'),
              ('AS',        'ORDER BY'),
              ('FILTER',    'FILTER'),
              ('GROUP BY',  'LIMIT'),
              ('GROUP BY',  'ORDER BY'),
              ('ORDER BY',  'LIMIT'),
              ('FILTER',    'LIMIT'),
              ('FILTER',    'GROUP BY'),
              ('WHERE',     'OPTIONAL'),
              ('WHERE',     'FILTER'),
              ('WHERE',     'WHERE'),
              ('WHERE',     'SELECT'),
              ('WHERE',     'LIMIT'),
              ('WHERE',     'AS'),
              ('WHERE',     'GROUP BY'),
              ('WHERE',     'ORDER BY'),
              ('PREFIX',    'OPTIONAL'),
              ('PREFIX',    'FILTER'),
              ('PREFIX',    'WHERE'),
              ('PREFIX',    'PREFIX'),
              ('PREFIX',    'SELECT'),
              ('PREFIX',    'DISTINCT'),
              ('PREFIX',    'LIMIT'),
              ('PREFIX',    'CONSTRUCT'),
              ('PREFIX',    'AS'),
              ('PREFIX',    'GROUP BY'),
              ('PREFIX',    'ORDER BY'),
              ('SELECT',    'OPTIONAL'),
              ('SELECT',    'FILTER'),
              ('SELECT',    'WHERE'),
              ('SELECT',    'SELECT'),
              ('SELECT',    'LIMIT'),
              ('SELECT',    'AS'),
              ('SELECT',    'GROUP BY'),
              ('SELECT',    'ORDER BY'),
              ('DISTINCT',  'OPTIONAL'),
              ('DISTINCT',  'FILTER'),
              ('DISTINCT',  'WHERE'),
              ('DISTINCT',  'LIMIT'),
              ('DISTINCT',  'GROUP BY'),
              ('DISTINCT',  'ORDER BY'),
              ('CONSTRUCT', 'OPTIONAL'),
              ('CONSTRUCT', 'FILTER'),
              ('CONSTRUCT', 'WHERE'),
              ('CONSTRUCT', 'SELECT'),
              ('CONSTRUCT', 'LIMIT')
              )

# For each experiment, 'rem' wraps the line in `!x{...}` between two `\n`
# escapes and 'perl' is the same line as a `^(...)$` capture group. Raw
# f-strings emit the two-character `\n` escape into the pattern text rather
# than an actual newline character.
tot_ops = {
  # One experiment per operator read from operators.txt.
  'single': [
    {'rem': rf'\n!x{{[^\n]* {op} [^\n]*}}\n',
     'perl': rf'^([^\n]* {op} [^\n]*)$'}
    for op in ops
  ],
  # One experiment per ordered operator pair.
  'double': [
    {'rem': rf'\n!x{{[^\n]* {op1} [^\n]* {op2} [^\n]*}}\n',
     'perl': rf'^([^\n]* {op1} [^\n]* {op2} [^\n]*)$'}
    for op1, op2 in double_ops
  ],
}


In [6]:
# Materialise one experiment directory per SPARQL pattern:
#   <PROJECT_DIR>/exp/sparql/<single|double>/expNN/
# Each directory gets a doc.txt symlink to the SPARQL log plus the three regex
# files written below.
for key, value in tot_ops.items():
  categ_path = os.path.join(PROJECT_DIR, f'exp/sparql/{key}')
  os.makedirs(categ_path, exist_ok=True)
  for i, rgxs in enumerate(value):
    exp_path = os.path.join(categ_path, f'exp{i:02}')
    os.makedirs(exp_path, exist_ok=True)

    doc_path = os.path.join(exp_path, 'doc.txt')
    # lexists() is also true for a dangling symlink (exists() follows the link
    # and reports False), so a stale link left by a previous run is removed
    # instead of making os.symlink() raise FileExistsError.
    if os.path.lexists(doc_path):
      os.remove(doc_path)
    os.symlink(SPARQL, doc_path)

    # NOTE(review): only perl.rgx gets the inline (?m) multiline flag;
    # boost.rgx receives the bare `^...$` pattern. Presumably the Boost runner
    # enables multiline matching itself - confirm.
    outputs = (
      ('rematch.rgx', rgxs['rem']),
      ('perl.rgx', '(?m)' + rgxs['perl']),
      ('boost.rgx', rgxs['perl']),
    )
    for filename, pattern in outputs:
      with open(os.path.join(exp_path, filename), 'w') as fp:
        fp.write(pattern)