In [50]:
import pandas as pd
import os
import os.path
import csv
from math import log10, ceil, floor
import re
import random
from pprint import pprint

# "__file__" is a quoted literal here, not the module attribute, so abspath
# resolves it against the current working directory; `here` therefore ends up
# being the directory the kernel was started in.
here = os.path.dirname(os.path.abspath("__file__"))
# Project root, taken to be two directory levels above `here`.
PROJECT_DIR = os.path.abspath(os.path.join(here, "../.."))

# Input documents; the experiment directories created further down symlink
# one of these as their doc.txt.
ENGLISH = os.path.join(PROJECT_DIR, 'datasets/english-works/english-authors.txt')
SPARQL = os.path.join(PROJECT_DIR, 'datasets/RKBExplorer/sparql.log.1')

In [60]:
def split_in_bins(df):
  """Split the rows with a positive ``noutputs`` into power-of-ten bins.

  Rows whose ``noutputs`` is not greater than zero are dropped. The remaining
  rows are sorted by ``noutputs`` in descending order and distributed over the
  half-open intervals ``[10**i, 10**(i+1))`` that span the smallest to the
  largest value.

  Args:
    df: DataFrame with a numeric ``noutputs`` column.

  Returns:
    A list of DataFrames, one per decade in ascending order of ``i``. Each
    carries two extra columns, ``bin_range_min`` and ``bin_range_max``, holding
    the bounds of its interval. Decades without any row are returned as empty
    frames. The list is empty when ``df`` has no row with positive ``noutputs``.
  """
  positive = df[df["noutputs"] > 0]
  if positive.empty:
    # Nothing to bin; avoids an IndexError on .iloc[0] below.
    return []
  positive = positive.sort_values(by=['noutputs'], ascending=False)
  max_val = positive['noutputs'].iloc[0]
  min_val = positive['noutputs'].iloc[-1]
  output_dfs = []
  # floor(...) + 1 instead of ceil(...): when max_val is an exact power of ten
  # ceil(log10(max_val)) == floor(log10(max_val)), which would leave the decade
  # that contains max_val out of the range and silently drop those rows.
  for i in range(floor(log10(min_val)), floor(log10(max_val)) + 1):
    lower, upper = 10**i, 10**(i+1)
    in_bin = (lower <= positive['noutputs']) & (positive['noutputs'] < upper)
    output_dfs.append(positive[in_bin].assign(bin_range_min=lower, bin_range_max=upper))
  return output_dfs

def get_sample_queries(df, random_state=None):
  """Draw an equally sized random sample from every non-empty ``noutputs`` bin.

  The frame is binned with ``split_in_bins``. Every non-empty bin is shuffled
  and truncated to the size of the smallest non-empty bin, and the pieces are
  concatenated from the highest bin to the lowest. Each piece keeps its own
  0-based index, so index values repeat across bins.

  Args:
    df: DataFrame with a numeric ``noutputs`` column.
    random_state: Optional seed (or random state object) forwarded to
      ``DataFrame.sample`` so the draw can be reproduced. The default ``None``
      keeps the unseeded behaviour.

  Returns:
    A DataFrame with the sampled rows, including the ``bin_range_min`` and
    ``bin_range_max`` columns added by ``split_in_bins``. When no row falls
    into any bin, an empty slice of ``df`` (without the bin columns) is
    returned.
  """
  # A decade without rows would force min_len to 0 and empty the whole sample,
  # so only bins that actually contain rows take part.
  bins = [dframe for dframe in split_in_bins(df) if len(dframe) > 0]
  if not bins:
    return df.iloc[0:0]
  min_len = min(map(len, bins))
  sample_queries = [
    dframe.sample(frac=1, random_state=random_state).reset_index(drop=True).head(min_len)
    for dframe in reversed(bins)
  ]
  return pd.concat(sample_queries)

def get_first_rows(df, nrows):
  """Return the ``nrows`` rows with the largest positive ``noutputs``.

  Rows whose ``noutputs`` is not greater than zero are discarded; the rest are
  ordered from largest to smallest and renumbered from 0 before truncation.
  """
  has_output = df["noutputs"] > 0
  ranked = (
    df[has_output]
    .sort_values(by=['noutputs'], ascending=False)
    .reset_index(drop=True)
  )
  return ranked.head(nrows)

def to_csv(df, filename):
  """Write ``df`` as CSV to ``filename`` inside ``here``, quoting every non-numeric field."""
  destination = os.path.join(here, filename)
  df.to_csv(destination, quoting=csv.QUOTE_NONNUMERIC)

In [66]:
# Load the multi-operator results, using the first CSV column as the index.
multi_ops = pd.read_csv(os.path.join(here, 'multi-ops.csv'), index_col=[0])

# Keep only the text wrapped by the !x{...} capture in each query.
capture_body = re.compile('!x{(.+)}')
multi_ops['query'] = multi_ops['query'].map(lambda text: capture_body.search(text).group(1))

# Balanced random sample across the noutputs bins; shown as the cell output.
df = get_sample_queries(multi_ops)
df


Unnamed: 0,query,noutputs,bin_range_min,bin_range_max
0,[^\n]* WHERE [^\n]*\n[^\n]* PREFIX [^\n]* SELE...,100617,100000,1000000
1,[^\n]* SELECT [^\n]* WHERE [^\n]*\n[^\n]* SELE...,101196,100000,1000000
2,[^\n]* PREFIX [^\n]*\n[^\n]* WHERE [^\n]*,100633,100000,1000000
3,[^\n]* SELECT [^\n]* WHERE [^\n]*\n[^\n]* WHER...,105706,100000,1000000
4,[^\n]* PREFIX [^\n]* WHERE [^\n]*\n[^\n]* WHER...,100621,100000,1000000
...,...,...,...,...
29,[^\n]* OPTIONAL [^\n]* FILTER [^\n]*\n[^\n]* S...,29,10,100
30,[^\n]* PREFIX [^\n]* ORDER BY [^\n]*\n[^\n]* O...,54,10,100
31,[^\n]* AS [^\n]* WHERE [^\n]*\n[^\n]* OPTIONAL...,73,10,100
32,[^\n]* SELECT [^\n]* OPTIONAL [^\n]* OPTIONAL ...,67,10,100


In [69]:
# Create one experiment directory per sampled query, grouped by the upper bound
# of the query's noutputs bin. Each directory gets a doc.txt symlink to the
# SPARQL log plus the query written in rematch, perl and boost syntax.
for i, row in df.iterrows():
  exp_path = os.path.join(PROJECT_DIR, f'exp/crossings/sparql/2lines/sample/k{row["bin_range_max"]}/exp{i:03}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists is also true for a dangling symlink; exists() follows the link and
  # would report False, leaving the stale link in place so os.symlink raises
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(SPARQL, doc_path)
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    fp.write(f'!x{{\\n{row["query"]}\\n}}')
  # The perl and boost files receive the same pattern text.
  for rgx_name in ('perl.rgx', 'boost.rgx'):
    with open(os.path.join(exp_path, rgx_name), 'w') as fp:
      fp.write(f'(\\n{row["query"]}\\n)')

In [14]:
import itertools

# Create one experiment directory per 3-element combination of queries. Each
# gets a doc.txt symlink to the English corpus and a pattern that matches six
# consecutive words, every one of them matching any of the three queries.
# NOTE(review): this cell's execution count (14) precedes the cells above it, so
# the `df` it was run against was presumably a different frame than the one a
# top-to-bottom run leaves in scope here -- confirm before re-running.
for i, s in enumerate(itertools.combinations(df['query'], 3)):
  exp_path = os.path.join(PROJECT_DIR, f'exp/morphemes/40choose3/exp{i:04}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists is also true for a dangling symlink; exists() follows the link and
  # would report False, leaving the stale link in place so os.symlink raises
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(ENGLISH, doc_path)
  capture = f"{s[0]}|{s[1]}|{s[2]}"
  # Raw f-strings: \W must reach the file verbatim, and in a non-raw literal it
  # is an invalid escape sequence that newer Python versions warn about.
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    fp.write(rf'\W!x{{(({capture})\W+){{5}}({capture})}}\W')
  # The perl and boost files receive the same pattern text.
  for rgx_name in ('perl.rgx', 'boost.rgx'):
    with open(os.path.join(exp_path, rgx_name), 'w') as fp:
      fp.write(rf'\W((?:(?:{capture})\W+){{5}}(?:{capture}))\W')

In [135]:
# Load the morpheme results and keep the 100 rows with the largest noutputs.
morphemes = pd.read_csv(os.path.join(here, 'morpheme.csv'), index_col=[0])
top_morphemes = get_first_rows(morphemes, 100)
# Strip the literal \W!x{ ... }\W wrapper so only the inner pattern remains.
df = top_morphemes.assign(
  query=top_morphemes['query'].map(lambda text: re.fullmatch('\\\\W!x{(.+)}\\\\W', text).group(1))
)

In [136]:
# Every ordered pair of queries (including a query paired with itself), joined
# by a literal \W token. Only the `query` column is needed, so iterate over it
# directly instead of building a row Series per pair with nested iterrows().
queries = df['query'].tolist()
# Raw f-string: \W must end up verbatim in the pattern, and in a non-raw
# literal it is an invalid escape sequence that newer Python versions warn about.
cross = [rf'{left}\W{right}' for left in queries for right in queries]

cross

['a\\w+\\Wa\\w+',
 'a\\w+\\Wan\\w+',
 'a\\w+\\W\\w+er',
 'a\\w+\\Wco\\w+',
 'a\\w+\\W\\w+ly',
 'a\\w+\\Wbe\\w+',
 'a\\w+\\Wre\\w+',
 'a\\w+\\W\\w+or',
 'a\\w+\\W\\w+ion',
 'a\\w+\\Win\\w+',
 'a\\w+\\Wde\\w+',
 'a\\w+\\Wdi\\w+',
 'a\\w+\\W\\w+ent',
 'a\\w+\\W\\w+geo?\\w+',
 'a\\w+\\W\\w+al',
 'a\\w+\\Wcom\\w+',
 'a\\w+\\Wcon\\w+',
 'a\\w+\\Wun\\w+',
 'a\\w+\\Wex\\w+',
 'a\\w+\\Wen\\w+',
 'a\\w+\\Wpro\\w+',
 'a\\w+\\W\\w+ation',
 'a\\w+\\W\\w+ive',
 'a\\w+\\Wdis\\w+',
 'a\\w+\\W\\w+ous',
 'a\\w+\\W\\w+ate',
 'a\\w+\\W\\w+ure',
 'a\\w+\\Wpre\\w+',
 'a\\w+\\W\\w+ment',
 'a\\w+\\W\\w+ence',
 'a\\w+\\W\\w+able',
 'a\\w+\\W\\w+ant',
 'a\\w+\\W\\w+ity',
 'a\\w+\\W\\w+ness',
 'a\\w+\\W\\w+ance',
 'a\\w+\\W\\w+ful',
 'a\\w+\\Wim\\w+',
 'a\\w+\\W\\w+age',
 'a\\w+\\W\\w+ish',
 'a\\w+\\W\\w+ic',
 'a\\w+\\W\\w+ary',
 'a\\w+\\W\\w+mi[ts]\\w+',
 'a\\w+\\W\\w+bio?\\w+',
 'a\\w+\\W\\w+ise',
 'a\\w+\\Wbi\\w+',
 'a\\w+\\W\\w+less',
 'a\\w+\\Winter\\w+',
 'a\\w+\\Wmis\\w+',
 'a\\w+\\W\\w+ible',
 'a\\w+\\Wc

In [142]:
# Create one experiment directory per crossed pattern. Each gets a doc.txt
# symlink to the English corpus plus the pattern written in rematch, perl and
# boost syntax.
for i, regex in enumerate(cross):
  exp_path = os.path.join(PROJECT_DIR, f'exp/morphemes-ngrams/exp{i:04}')
  os.makedirs(exp_path, exist_ok=True)
  doc_path = os.path.join(exp_path, 'doc.txt')
  # lexists is also true for a dangling symlink; exists() follows the link and
  # would report False, leaving the stale link in place so os.symlink raises
  # FileExistsError.
  if os.path.lexists(doc_path):
    os.remove(doc_path)
  os.symlink(ENGLISH, doc_path)
  # Raw f-strings: \W must reach the file verbatim, and in a non-raw literal it
  # is an invalid escape sequence that newer Python versions warn about.
  with open(os.path.join(exp_path, 'rematch.rgx'), 'w') as fp:
    fp.write(rf'\W!x{{{regex}}}\W')
  # The perl and boost files receive the same pattern text.
  for rgx_name in ('perl.rgx', 'boost.rgx'):
    with open(os.path.join(exp_path, rgx_name), 'w') as fp:
      fp.write(rf'\W({regex})\W')