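Converts the current lemmas and the sequents of recorded lemma requests into corpus files (plus a parallel file of the requested commands), then trains a Hugging Face byte-level BPE tokenizer vocabulary on that corpus.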

In [1]:
from IPython.display import JSON
import json

from featurizer import *
from theorybank import *


615 9205


In [2]:
# Build the theory bank object; its `all_lemmas` attribute is iterated further
# down to write the training corpus.
# NOTE(review): gen_default_theorybank is brought in by one of the star imports
# above (presumably `theorybank`) — confirm which module provides it.
theorybank = gen_default_theorybank()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 345.37it/s]


In [17]:
# Directory that receives the generated corpus and command files.
# Path is imported explicitly here so that this cell does not rely on a star
# import happening to export it, or on a later cell having been run first.
from pathlib import Path

OUTPUTS_DIR = Path("outputs")
# Create the directory (and any missing parents); a pre-existing one is fine.
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

In [26]:
# Set up for training a vocabulary.
# Assemble a corpus containing all of the lemmas, followed by the state of every
# lemma request (the sequent at which the request was made).  The command that
# accompanies each request is written, one per line, to a parallel commands file.
# NOTE: May want to save commands as bpe encoded tokens, given new commands may be coded up.

# Read the recorded lemma requests.  The encoding is given explicitly so the
# result does not depend on the platform's default locale.
with open("lemma_requests.json", "r", encoding="utf-8") as f:
    lemma_requests = json.load(f)

corpus_fpath = Path(OUTPUTS_DIR, "corpus.txt")
commands_fpath = Path(OUTPUTS_DIR, "commands.txt")

# Both outputs are written as UTF-8 so the files are identical on every platform.
with open(corpus_fpath, "w", encoding="utf-8") as f, \
        open(commands_fpath, "w", encoding="utf-8") as cmd_f:
    # One line per lemma: its elements stringified and space-separated.
    # The lemma name is not part of the corpus.
    for _name, lemma_body in theorybank.all_lemmas.items():
        f.write(" ".join(str(x) for x in lemma_body) + "\n")
    # One corpus line per lemma request state, with the matching command on the
    # same line number of the commands file.
    for lreq in lemma_requests:
        f.write(" ".join(str(x) for x in lreq["state"]) + "\n")
        cmd_f.write("{}\n".format(lreq["command"]))


In [29]:
# Now train the vocabulary model

from pathlib import Path
from collections import Counter
from tokenizers import ByteLevelBPETokenizer

from coprover import RSC_ROOT

# The vocabulary directory is derived once from OUTPUTS_DIR (the same root the
# corpus path below uses) instead of repeating the "outputs/nir_vocab" literal.
vocab_dir = Path(OUTPUTS_DIR, "nir_vocab")
vocab_dir.mkdir(exist_ok=True, parents=True)

# Special tokens used by sequent representation and for Roberta model
BOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
SEP_TOKEN = "<sep>"
CLS_TOKEN = "<cls>"
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
MASK_TOKEN = "<mask>"

# Order matters: the list is handed to the trainer as-is, so do not reorder it.
SPEC_TOKS = [
    "antecedent",
    "consequent",
    "hidden",
    "null",
    BOS_TOKEN,
    EOS_TOKEN,
    SEP_TOKEN,
    CLS_TOKEN,
    UNK_TOKEN,
    PAD_TOKEN,
    MASK_TOKEN,
]

corpus_files = [Path(OUTPUTS_DIR, "corpus.txt")]

# Get vocab histogram: count every whitespace-delimited token in the corpus.
# The files are read as UTF-8 explicitly rather than with the platform default.
tok_freq = Counter()
for fpath in corpus_files:
    with open(fpath, "r", encoding="utf-8") as f:
        for line in f:
            # split() with no argument already discards leading/trailing whitespace.
            tok_freq.update(line.split())

# The requested vocabulary size is the number of distinct whitespace tokens.
# NOTE(review): a byte-level BPE vocabulary also has to hold its base byte
# alphabet and the special tokens — confirm this budget is large enough.
vocab_size = len(tok_freq)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=[str(x) for x in corpus_files],
    vocab_size=vocab_size,
    min_frequency=1,
    special_tokens=SPEC_TOKS,
)

# NOTE: Need to save_model and then save for all the necessary files.
tokenizer.save_model(str(vocab_dir))
tokenizer.save(str(Path(vocab_dir, "config.json")))

Counter({'argument': 1202890,
         'operator': 1169116,
         'subtype': 382791,
         'null': 338658,
         'new?': 162318,
         'formula': 162318,
         'asserted?': 162318,
         'integer-value': 161385,
         'bindings': 120907,
         'expression': 120009,
         'booleans__NOT': 116591,
         'equalities__equal': 102606,
         'number_fields__times': 86352,
         'number_fields__difference__1': 74974,
         'functiontype': 73796,
         '0': 71120,
         '1': 68024,
         'number_fields__plus': 65961,
         'variableName': 64993,
         'constantName': 47012,
         'booleans__AND': 42226,
         'booleans__IMPLIES': 38377,
         'reals__lesseqp': 34782,
         'exponentiation__caret': 33951,
         'reals__lessp': 33901,
         'field': 29236,
         '2': 25480,
         'real_defs__abs': 24351,
         'sigma__sigma': 23456,
         'polynomials__polynomial': 20313,
         'current-goal': 20221,
         