In [1]:
import os
import os.path as path
from pprint import pprint
import re
import json
import random
from shutil import copytree

# A notebook kernel does not define ``__file__``, so paths are anchored at the
# kernel's current working directory (the original code obtained the same
# value indirectly via ``dirname(abspath("__file___"))``, where the argument
# was just a placeholder string, not the ``__file__`` variable).
HERE = os.getcwd()
# The project root is expected two levels above the working directory.
PROJECT_DIR = path.abspath(path.join(HERE, "../.."))

# Per-dataset configuration.
#   "exp-path":        directory holding the dataset's experiments.
#   "splitting-regex": pattern applied (via ``re.match``) to the text of each
#                      experiment's ``rematch.rgx`` file; its two capture
#                      groups are the halves extracted by ``get_doubles``.
#                      Datasets without this key (the ``*_lookahead`` ones)
#                      cannot be processed by ``get_doubles``.
# All patterns are raw strings so that backslashes reach the regex engine
# verbatim instead of relying on Python leaving unknown escapes untouched.
DATASETS = {
    "dna": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/dna/1var"),
        "splitting-regex": r"^!x\{(.*)\.\{0,20\}(.*)\}$",
    },
    "dna_lookahead": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/dna/lookahead"),
    },
    "dna_2vars": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/dna/2vars"),
        "splitting-regex": r"^!x\{(.*)\.\{0,20\}(.*)\}$",
    },
    "morphemes": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/morphemes/2grams"),
        "splitting-regex": r"^\\W!x\{(.*?)\\W(.*?)\}\\W$",
    },
    "morphemes_lookahead": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/morphemes/lookahead"),
    },
    "morphemes_2vars": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/morphemes/2vars"),
        "splitting-regex": r"^!x\{(.*)\.\{0,20\}(.*)\}$",
    },
    "sparql": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/sparql/1-var/2lines"),
        "splitting-regex": r"^!x\{\\n(.*\[\^\\n\]\*)\\n(\[\^\\n\]\*.*)\\n\}$"
    },
    "sparql_lookahead": {
        "exp-path": path.join(PROJECT_DIR, "exp/crossings/sparql/1-var/lookahead"),
    }
}


def get_subdirs(parent_dir: str):
    """Return the paths of the immediate subdirectories of ``parent_dir``.

    Args:
        parent_dir: Directory to scan (not recursive).

    Returns:
        A list of paths (``parent_dir`` joined with each entry name), sorted
        lexicographically. ``os.scandir`` yields entries in arbitrary order,
        so sorting keeps anything derived from this list (numbering, output
        ordering) stable between runs and machines.

    Raises:
        OSError: If ``parent_dir`` cannot be scanned (e.g. it does not exist).
    """
    # The context manager releases the directory handle even if is_dir() raises.
    with os.scandir(parent_dir) as entries:
        return sorted(entry.path for entry in entries if entry.is_dir())

In [2]:

def get_doubles(dataset: str):
  """Collect the two captured halves of every sampled expression of a dataset.

  Walks ``<exp-path>/sample/k<N>/<experiment>/rematch.rgx`` and applies the
  dataset's ``splitting-regex`` to the content of each ``rematch.rgx`` file.

  Args:
    dataset: Key into ``DATASETS``.

  Returns:
    A dict mapping each integer ``N`` (taken from the ``k<N>`` directory
    name) to a list of ``(group 1, group 2)`` tuples, one per experiment
    directory. The dict is ordered by ascending ``N``.

  Raises:
    KeyError: If ``dataset`` is unknown or has no ``splitting-regex``.
    ValueError: If a directory under ``sample`` is not named ``k<N>``, or a
      ``rematch.rgx`` file does not match the splitting regex.
  """
  dataset_cfg = DATASETS[dataset]
  if 'splitting-regex' not in dataset_cfg:
    # Fail before touching the filesystem, with a message naming the dataset.
    raise KeyError(f"dataset {dataset!r} has no 'splitting-regex' entry in DATASETS")
  splitter = re.compile(dataset_cfg['splitting-regex'])

  doubles_dict = dict()
  for size_dir in get_subdirs(path.join(dataset_cfg['exp-path'], 'sample')):
    size_match = re.match(r'^k(\d+)$', path.basename(size_dir))
    if size_match is None:
      raise ValueError(f"unexpected directory {size_dir!r}: name must look like 'k<N>'")
    k = int(size_match.group(1))
    doubles_dict[k] = []
    print(f"On k={k}")
    for exp_dir in get_subdirs(size_dir):
      rgx_path = path.join(exp_dir, 'rematch.rgx')
      with open(rgx_path) as rgxf:
        rematch_exp = rgxf.read()
      m = splitter.match(rematch_exp)
      if m is None:
        raise ValueError(
          f"content of {rgx_path!r} does not match the splitting regex of dataset {dataset!r}"
        )
      doubles_dict[k].append((m.group(1), m.group(2)))

  return dict(sorted(doubles_dict.items()))

def save_doubles(dataset: str):
  """Write the result of ``get_doubles(dataset)`` to ``cross_product.json``.

  The file is created (or overwritten) inside the dataset's ``exp-path``
  directory and is indented with 4 spaces. JSON object keys are strings, so
  the integer keys of the dict are stored as strings, and the tuples are
  stored as two-element arrays.

  Args:
    dataset: Key into ``DATASETS``.
  """
  # Compute and serialise first: opening with 'w' truncates the file
  # immediately, so doing it earlier would wipe an existing cross_product.json
  # whenever get_doubles() or the serialisation fails.
  serialized = json.dumps(get_doubles(dataset), indent = 4)
  out_path = path.join(DATASETS[dataset]['exp-path'], 'cross_product.json')
  with open(out_path, 'w') as jsonfile:
    jsonfile.write(serialized)
  
def retrieve_small_sample(dataset: str, sample_size: int = 30, seed=None):
  """Copy a random subset of a dataset's sampled experiments to ``small-sample``.

  The subdirectories of ``<exp-path>/sample`` are treated as size groups.
  ``sample_size // <number of groups>`` experiment directories are picked at
  random from each group and copied to ``<exp-path>/small-sample/expNNN``.
  Existing destination directories are merged into, not cleared
  (``dirs_exist_ok=True``).

  Args:
    dataset: Key into ``DATASETS``.
    sample_size: Approximate total number of experiments to copy; it is
      split evenly (integer division) across the size groups.
    seed: Optional seed for a private random generator, making the selection
      reproducible. When ``None`` (default) the module-level ``random``
      state is used, as before.

  Raises:
    ValueError: If ``sample`` contains no subdirectories.

  Note:
    A group with fewer experiments than requested contributes all of them
    instead of aborting midway; the ``expNNN`` numbering then has gaps.
  """
  exp_path = DATASETS[dataset]['exp-path']
  sample_dir = path.join(exp_path, 'sample')
  small_sample_dir = path.join(exp_path, 'small-sample')

  # Sorted so that group/experiment numbering does not depend on OS listing order.
  size_dirs = sorted(get_subdirs(sample_dir))
  if not size_dirs:
    raise ValueError(f"no subdirectories found in {sample_dir!r}; nothing to sample")

  # Only create the output directory once the input has been validated.
  os.makedirs(small_sample_dir, exist_ok = True)

  exps_per_size_sample = sample_size // len(size_dirs)
  rng = random if seed is None else random.Random(seed)

  for i, size_dir in enumerate(size_dirs):
    candidates = sorted(get_subdirs(size_dir))
    rng.shuffle(candidates)
    # range() of a non-positive count is empty, so nothing is copied then.
    for j in range(min(exps_per_size_sample, len(candidates))):
      chosen_dir = candidates[j]
      dest_dir = path.join(small_sample_dir, f"exp{i*exps_per_size_sample+j:03}")
      copytree(chosen_dir, dest_dir, dirs_exist_ok=True)
      


In [4]:
# Driver cell: runs one pipeline step for one dataset at a time.
# The commented-out calls below are the corresponding steps for the "sparql"
# dataset; uncomment the one that is needed.
# retrieve_small_sample("sparql")
# save_doubles("sparql")
# Copies a randomly chosen subset of the "sparql_lookahead" experiments into
# its small-sample directory. No random seed is set in the visible cells, so
# re-running this cell may select different experiments.
retrieve_small_sample("sparql_lookahead")