In [1]:
import os
import os.path
import csv
import pandas as pd
import re

# Root of the REmatch checkout. The original absolute path remains the default,
# so existing runs are unaffected; set REMATCH_PROJECT_DIR to point the notebook
# at a checkout that lives somewhere else.
PROJECT_DIR = os.environ.get(
  'REMATCH_PROJECT_DIR',
  '/home/nicovsj/repos/research/REmatch-org/REmatch',
)

# Documents that the experiment-generating cells symlink as `doc.txt`.
ENGLISH = os.path.join(PROJECT_DIR, 'datasets/english-works/english-authors.txt')
SPARQL = os.path.join(PROJECT_DIR, 'datasets/RKBExplorer/sparql.log.1')
DNA = os.path.join(PROJECT_DIR, 'datasets/dna/proteoms/zebrafish.faa')

# NOTE: '__file__' is a string literal here, not the module attribute, so
# abspath() resolves it against the current working directory and HERE ends up
# being that directory. The CSV inputs read below are looked up under HERE.
HERE = os.path.dirname(os.path.abspath('__file__'))

A function to return a list of line-separated regexes defined in a file.

In [2]:
def get_regexes(path):
  """Return the lines of the file at `path` as a list of strings.

  Each line is stripped of leading/trailing whitespace (including the
  newline). Blank lines are kept and show up as empty strings.
  """
  with open(path) as handle:
    return [raw_line.strip() for raw_line in handle]

In [5]:
# Load dna-sample.csv from HERE; the first CSV column becomes the DataFrame index.
# The next cell reads the `bin_range_max` and `query` columns of each row.
df = pd.read_csv(os.path.join(HERE, 'dna-sample.csv'), index_col=[0])

In [4]:
# Create one experiment directory per row of `df`, at
#   exp/crossings/dna/sample/k<bin_range_max>/exp<row index>/
# Each directory gets a `doc.txt` symlink to the DNA document and the row's
# query written out in REmatch, Perl and Boost syntax.
for i, row in df.iterrows():
  exp_path = os.path.join(
    PROJECT_DIR,
    f'exp/crossings/dna/sample/k{row["bin_range_max"]}/exp{row.name:03}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists() is True even for a dangling symlink (exists() is not), so a stale
  # link from a previous run is removed instead of making os.symlink() raise
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(DNA, doc_path)
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    fp.write(f'!x{{{row["query"]}}}')
  with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
    fp.write(f'({row["query"]})')
  with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
    fp.write(f'({row["query"]})')

Now, get every morpheme defined in the datasets directory

In [3]:
# Read the four morpheme lists (suffixes, prefixes, Greek and Latin roots),
# one file per list, in that order.
suff, pref, greek, latin = (
  get_regexes(os.path.join(PROJECT_DIR, f'datasets/morphemes/{stem}.txt'))
  for stem in ('suff', 'pref', 'greek', 'latin')
)

In [12]:
import pandas as pd

# Load the morpheme queries from morphemes.csv (under HERE) and pad each query
# with `\w+` according to its `type`:
#   suffix -> \w+<query>      prefix -> <query>\w+      anything else -> \w+<query>\w+
df = pd.read_csv(os.path.join(HERE, 'morphemes.csv'))

for i, row in df.iterrows():
  # Raw f-strings: `\w` is not a valid Python string escape, so the non-raw
  # form triggers an invalid-escape warning. The resulting text is unchanged.
  if row['type'] == 'suffix':
    df.at[i, 'query'] = rf'\w+{row["query"]}'
  elif row['type'] == 'prefix':
    df.at[i, 'query'] = rf'{row["query"]}\w+'
  else:
    df.at[i, 'query'] = rf'\w+{row["query"]}\w+'

# Group rows by type, then renumber them 0..n-1 in that order. reset_index()
# keeps the previous row labels in a new `index` column.
df = df.sort_values(by=['type'])
df = df.reset_index()

In [14]:
# Create one experiment directory per morpheme query, at
#   exp/morphemes/<type>/exp<row number>/
# Each directory gets a `doc.txt` symlink to the English document and the query,
# delimited by `\W` on both sides, in REmatch, Perl and Boost syntax.
for i, row in df.iterrows():
  exp_path = os.path.join(PROJECT_DIR, f'exp/morphemes/{row["type"]}/exp{i:02}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists() is True even for a dangling symlink (exists() is not), so a stale
  # link from a previous run is removed instead of making os.symlink() raise
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(ENGLISH, doc_path)
  # Raw f-strings: `\W` is not a valid Python string escape; the bytes written
  # are the same as before, without the invalid-escape warning.
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    fp.write(rf'\W!x{{{row["query"]}}}\W')
  with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
    fp.write(rf'\W({row["query"]})\W')
  with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
    fp.write(rf'\W({row["query"]})\W')

In [4]:
# Load the SPARQL operators, one per line, from datasets/sparql/operators.txt.
ops = get_regexes(os.path.join(PROJECT_DIR, 'datasets/sparql/operators.txt'))

In [15]:
# One pattern per operator: the operator between single spaces, with `[^\n]*`
# on either side.
tot_ops = [f'[^\\n]* {op} [^\\n]*' for op in ops]

# One pattern per ordered pair of operators (an operator may pair with itself).
double_ops = [
  f'[^\\n]* {op1} [^\\n]* {op2} [^\\n]*'
  for op1 in ops
  for op2 in ops
]


len(double_ops)

169

In [20]:
# Load ops-filtered.csv from HERE with the first CSV column as the index, then
# display it. The cells below read its `query` column.
df = pd.read_csv(os.path.join(HERE, 'ops-filtered.csv'), index_col=[0])
df

Unnamed: 0,query,noutputs
0,[^\n]* WHERE [^\n]*,116659
1,[^\n]* SELECT [^\n]* WHERE [^\n]*,105747
2,[^\n]* SELECT [^\n]*,105747
3,[^\n]* SELECT [^\n]* WHERE [^\n]*,105747
4,[^\n]* PREFIX [^\n]*,100668
...,...,...
95,[^\n]* SELECT [^\n]* OPTIONAL [^\n]* ORDER BY ...,2342
96,[^\n]* PREFIX [^\n]* ORDER BY [^\n]*,2342
97,[^\n]* PREFIX [^\n]* OPTIONAL [^\n]* GROUP BY ...,2342
98,[^\n]* PREFIX [^\n]* OPTIONAL [^\n]* ORDER BY ...,2342


In [18]:
# For every filtered query keep the query itself, followed by one extension per
# operator (the query with ` <op> [^\n]*` appended).
triples = []
for base in df['query']:
  triples.append(base)
  triples.extend(f'{base} {op} [^\\n]*' for op in ops)

In [19]:
# Create one experiment directory per entry of `triples` under
# exp/sparql/triples/. Each gets a `doc.txt` symlink to the SPARQL log and the
# regex anchored to a line in REmatch, Perl and Boost syntax.
categ_path = os.path.join(PROJECT_DIR, 'exp/sparql/triples')
for i, rgx in enumerate(triples):
  exp_path = os.path.join(categ_path, f'exp{i:03}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists() is True even for a dangling symlink (exists() is not), so a stale
  # link from a previous run is removed instead of making os.symlink() raise
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(SPARQL, doc_path)
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    # `\\n` writes a literal backslash-n (the regex escape), not a newline.
    fp.write(f'\\n!x{{{rgx}}}\\n')
  with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
    fp.write('(?m)' + f'^({rgx})$')
  with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
    fp.write(f'^({rgx})$')

In [21]:
# Every ordered pair of filtered queries, joined by a literal `\n` regex escape
# so that the pair spans two consecutive lines.
crossing_ops = [
  f'{r1}\\n{r2}'
  for r1 in df['query']
  for r2 in df['query']
]

crossing_ops

['[^\\n]* WHERE [^\\n]*\\n[^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* SELECT [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* SELECT [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* SELECT [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* SELECT [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* SELECT [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* SELECT [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* DISTINCT [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* DISTINCT [^\\n]* WHERE [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* DISTINCT [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* PREFIX [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* PREFIX [^\\n]*',
 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* PREFIX [^\\n]* PREFIX [

In [11]:
# Create one experiment directory per two-line pattern under
# exp/crossings/sparql/2lines/. Each gets a `doc.txt` symlink to the SPARQL log
# and the regex in REmatch, Perl and Boost syntax.
categ_path = os.path.join(PROJECT_DIR, 'exp/crossings/sparql/2lines')
for i, rgx in enumerate(crossing_ops):
  exp_path = os.path.join(categ_path, f'exp{i:04}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists() is True even for a dangling symlink (exists() is not), so a stale
  # link from a previous run is removed instead of making os.symlink() raise
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(SPARQL, doc_path)
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    # `\\n` writes a literal backslash-n (the regex escape), not a newline.
    fp.write(f'\\n!x{{{rgx}}}\\n')
  with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
    fp.write('(?m)' + f'^({rgx})$')
  with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
    fp.write(f'^({rgx})$')

In [36]:
# Create exp/sparql/<category>/exp<NN>/ for every regex of every category.
#
# The cell that defines `tot_ops` builds a plain list, while this loop iterates
# it as a dict of category name -> list of regexes (hidden state from an earlier
# session). Accept both forms so the cell runs on a fresh kernel: a bare list is
# filed under 'single', the key that the consecutive-queries cell reads.
# NOTE(review): confirm 'single' is the intended category for the list form.
sparql_categories = tot_ops if isinstance(tot_ops, dict) else {'single': tot_ops}
for key, value in sparql_categories.items():
  categ_path = os.path.join(PROJECT_DIR, f'exp/sparql/{key}')
  os.makedirs(categ_path, exist_ok=True)
  for i, rgxs in enumerate(value):
    exp_path = os.path.join(categ_path, f'exp{i:02}')
    os.makedirs(exp_path, exist_ok=True)
    doc_path = os.path.join(exp_path, 'doc.txt')
    # lexists() is True even for a dangling symlink (exists() is not), so a
    # stale link from a previous run is removed instead of making os.symlink()
    # raise FileExistsError.
    if os.path.lexists(doc_path):
      os.remove(doc_path)
    os.symlink(SPARQL, doc_path)
    with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
      # `\\n` writes a literal backslash-n (the regex escape), not a newline.
      fp.write(f'\\n!x{{{rgxs}}}\\n')
    with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
      fp.write('(?m)' + f'^({rgxs})$')
    with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
      fp.write(f'^({rgxs})$')

#### Consecutive queries (SPARQL)

In [42]:
import pprint

consecutive = dict()
consecutive['single'] = dict()

# The cell that defines `tot_ops` builds a plain list, but this cell indexes it
# with the 'single' key (hidden state from an earlier session, where it was a
# dict). Accept both forms so the cell runs on a fresh kernel.
# NOTE(review): confirm the list form is what 'single' used to hold.
single_queries = tot_ops['single'] if isinstance(tot_ops, dict) else tot_ops

# For each k, every query yields the list [q, q\nq, ..., q repeated k times],
# where repetitions are joined by a literal `\n` regex escape.
for k in [2,3,4]:
  consecutive['single'][k] = [
    ['\\n'.join([query] * j) for j in range(1, k+1)]
    for query in single_queries
  ]


pprint.pprint(consecutive)

{'single': {2: [['[^\\n]* OPTIONAL [^\\n]*',
                 '[^\\n]* OPTIONAL [^\\n]*\\n[^\\n]* OPTIONAL [^\\n]*'],
                ['[^\\n]* FILTER [^\\n]*',
                 '[^\\n]* FILTER [^\\n]*\\n[^\\n]* FILTER [^\\n]*'],
                ['[^\\n]* WHERE [^\\n]*',
                 '[^\\n]* WHERE [^\\n]*\\n[^\\n]* WHERE [^\\n]*'],
                ['[^\\n]* PREFIX [^\\n]*',
                 '[^\\n]* PREFIX [^\\n]*\\n[^\\n]* PREFIX [^\\n]*'],
                ['[^\\n]* SELECT [^\\n]*',
                 '[^\\n]* SELECT [^\\n]*\\n[^\\n]* SELECT [^\\n]*'],
                ['[^\\n]* DISTINCT [^\\n]*',
                 '[^\\n]* DISTINCT [^\\n]*\\n[^\\n]* DISTINCT [^\\n]*'],
                ['[^\\n]* LIMIT [^\\n]*',
                 '[^\\n]* LIMIT [^\\n]*\\n[^\\n]* LIMIT [^\\n]*'],
                ['[^\\n]* CONSTRUCT [^\\n]*',
                 '[^\\n]* CONSTRUCT [^\\n]*\\n[^\\n]* CONSTRUCT [^\\n]*'],
                ['[^\\n]* BIND [^\\n]*',
                 '[^\\n]* BIND [^\\n]*\\n[^\\n]*

In [46]:
# Create exp/sparql-multiline/<category>/<k>lines/exp<NNN>/ for every group of
# regexes in `consecutive`. The regexes of a group are written as alternatives
# joined by `|`, in REmatch, Perl and Boost syntax.
for key, value in consecutive.items():
  categ_path = os.path.join(PROJECT_DIR, f'exp/sparql-multiline/{key}')
  os.makedirs(categ_path, exist_ok=True)
  for num, val2 in value.items():
    subcateg_path = os.path.join(categ_path, f'{num}lines')
    os.makedirs(subcateg_path, exist_ok=True)
    for i, rgxs in enumerate(val2):
      exp_path = os.path.join(subcateg_path, f'exp{i:03}')
      os.makedirs(exp_path, exist_ok=True)
      doc_path = os.path.join(exp_path, 'doc.txt')
      # lexists() is True even for a dangling symlink (exists() is not), so a
      # stale link from a previous run is removed instead of making
      # os.symlink() raise FileExistsError.
      if os.path.lexists(doc_path):
        os.remove(doc_path)
      os.symlink(SPARQL, doc_path)
      with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
        # `\\n` writes a literal backslash-n (the regex escape), not a newline.
        fp.write('|'.join(f'\\n!x{{{r}}}\\n' for r in rgxs))
      with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
        fp.write('(?m)' + '|'.join(f'^({r})$' for r in rgxs))
      with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
        fp.write('|'.join(f'^({r})$' for r in rgxs))

#### Consecutive queries (ENGLISH)

In [4]:
from itertools import chain

# Word-level patterns: suffixes get `\w+` in front, prefixes get `\w+` behind,
# Greek/Latin roots get `\w*` on both sides.
morphemes = list(chain(
  (rf'\w+{w}' for w in suff),
  (rf'{w}\w+' for w in pref),
  (rf'\w*{w}\w*' for w in greek+latin),
))

# Every ordered pair of word patterns, separated by one or more non-word
# characters.
duplicate = [
  rf'{m1}\W+{m2}'
  for m1 in morphemes
  for m2 in morphemes
]

morf = {'2-grams': duplicate}

In [14]:
# Create exp/morphemes-ngrams/<category>/exp<NNNNN>/ for every regex in `morf`.
# Each directory gets a `doc.txt` symlink to the English document and the regex,
# delimited by `\W` on both sides, in REmatch, Perl and Boost syntax.
for key, value in morf.items():
  categ_path = os.path.join(PROJECT_DIR, f'exp/morphemes-ngrams/{key}')
  os.makedirs(categ_path, exist_ok=True)
  for i, rgx in enumerate(value):
    exp_path = os.path.join(categ_path, f'exp{i:05}')
    os.makedirs(exp_path, exist_ok=True)
    doc_path = os.path.join(exp_path, 'doc.txt')
    # lexists() is True even for a dangling symlink (exists() is not), so a
    # stale link from a previous run is removed instead of making os.symlink()
    # raise FileExistsError.
    if os.path.lexists(doc_path):
      os.remove(doc_path)
    os.symlink(ENGLISH, doc_path)
    # Raw f-strings: `\W` is not a valid Python string escape; the bytes written
    # are the same as before, without the invalid-escape warning.
    with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
      fp.write(rf'\W!x{{{rgx}}}\W')
    with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
      fp.write(rf'\W({rgx})\W')
    with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
      fp.write(rf'\W({rgx})\W')

#### Motif duplication

In [4]:
import pandas as pd

# Load the motif table from exp/dna/gt1_motifs.csv under PROJECT_DIR. The cells
# below iterate over its `motif` column.
motifs_df = pd.read_csv(os.path.join(PROJECT_DIR, 'exp/dna/gt1_motifs.csv'))

Unnamed: 0,nexp,motif,noutputs
0,336,[ST].{2}[DE],699991
1,1,[ST].[RK],584269
2,338,G[^EDRKHPFYW].{2}[STAGCN][^P],471272
3,0,N[^P][ST][^P],225644
4,335,[RK]{2}.[ST],86203
...,...,...,...
697,197,[YH]C[VI][SA]CAIH,2
698,765,[LIVM]{2}[GSA].GG[IV].[STGDN].{3}[ACV].{2}[^A]...,2
699,152,[LIVMH]H[RT][GA].EK[LIVMTN].E.[KRQ],2
700,1738,N.{3}[DEH].{2}[LIMFYT]D.{2}[VM].R[ST].{2}R.{4}...,2


In [8]:
# Create one experiment directory per motif under exp/dna/motifs/. Each gets a
# `doc.txt` symlink to the DNA document and the motif in REmatch, Perl and Boost
# syntax.
categ_path = os.path.join(PROJECT_DIR, 'exp/dna/motifs')
os.makedirs(categ_path, exist_ok=True)
for i, rgx in enumerate(motifs_df['motif']):
  exp_path = os.path.join(categ_path, f'exp{i:03}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists() is True even for a dangling symlink (exists() is not), so a stale
  # link from a previous run is removed instead of making os.symlink() raise
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(DNA, doc_path)
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    fp.write(f'!x{{{rgx}}}')
  with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
    fp.write(f'({rgx})')
  with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
    fp.write(f'({rgx})')

In [9]:
# For each gap bound k, create exp/dna-duplicate/motifs-<k>/exp<NNN>/ for every
# motif. The regex is the motif, up to k arbitrary characters, then the motif
# again, in REmatch, Perl and Boost syntax.
#
# NOTE(review): the saved cell text was scrambled (the `enumerate(...)` call was
# split around the write statements and the line opening rematch.rgx was
# missing), which made it a syntax error. The statement order is restored here
# to mirror the motifs cell; confirm against the author's intent.
for k in [10, 100, 1000, 10000]:
  categ_path = os.path.join(PROJECT_DIR, f'exp/dna-duplicate/motifs-{k}')
  os.makedirs(categ_path, exist_ok=True)
  for i, rgx in enumerate(motifs_df['motif']):
    exp_path = os.path.join(categ_path, f'exp{i:03}')
    os.makedirs(exp_path, exist_ok=True)
    doc_path = os.path.join(exp_path, 'doc.txt')
    # lexists() is True even for a dangling symlink (exists() is not), so a
    # stale link from a previous run is removed instead of making os.symlink()
    # raise FileExistsError.
    if os.path.lexists(doc_path):
      os.remove(doc_path)
    os.symlink(DNA, doc_path)
    with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
      fp.write(f'!x{{{rgx}.{{0,{k}}}{rgx}}}')
    with open(os.path.join(exp_path, 'perl.rgx'), 'w') as fp:
      fp.write(f'({rgx}.{{0,{k}}}{rgx})')
    with open(os.path.join(exp_path, 'boost.rgx'), 'w') as fp:
      fp.write(f'({rgx}.{{0,{k}}}{rgx})')
    