In [1]:
%load_ext autoreload
%autoreload 2

This attempts an extremely simple neural reranker experiment.  Motivated by "Passage Re-ranking with BERT", 

https://arxiv.org/abs/1901.04085
https://github.com/nyu-dl/dl4marco-bert
Passage Re-ranking with BERT
Rodrigo Nogueira, Kyunghyun Cho

We train a model where we start with the query [CLS] followed by the lemma, and then have a binary decision.

In [2]:
import collections
from collections import Counter
from pprint import pprint
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from pprint import pprint
from sklearn.model_selection import train_test_split
from IPython.display import JSON
from tqdm import tqdm
import numpy as np
import json
from enum import Enum
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import thundersvm

from coprover import PROJ_ROOT, RSC_ROOT, PVSLIB_ROOT
from featurizer import *

DATA_ROOT = Path(PROJ_ROOT, "data", "pvs", "pvslib")
json_files = list(DATA_ROOT.rglob("*.json"))

theory_files = list(DATA_ROOT.glob("*/*.json"))
proof_files = list(DATA_ROOT.glob("*/*/*.json"))
print(len(theory_files), len(proof_files))

import pdb

class VecTypes:
    COUNT = 1
    TFIDF = 2

class TheoryBank:
    def __init__(self, theory_files, 
                 vectorizer_type=VecTypes.COUNT,
                 norm_vecs=True):
        self.all_theories = collections.OrderedDict()
        self.all_lemmas = collections.OrderedDict()
        for json_fpath in tqdm(theory_files):
            with open(json_fpath, 'r') as f:
                theory_name = json_fpath.stem
                doc_root = json.load(f)
                theory = read_theory(doc_root)
                self.all_theories[theory_name] = theory
                self.all_lemmas.update(theory)
        corpus = [" ".join([str(y) for y in x]) for x in self.all_lemmas.values()]
        self.names = list(self.all_lemmas.keys())
        if vectorizer_type == VecTypes.COUNT:
            self.vectorizer = CountVectorizer(stop_words=None, lowercase=False)
        elif vectorizer_type == VecTypes.TFIDF:
            self.vectorizer = TfidfVectorizer(stop_words=None, lowercase=False)
        self.M = self.vectorizer.fit_transform(corpus).toarray()
        # 1-norm
        self.norm_vecs = norm_vecs
        if self.norm_vecs:
            self.M = self.M / np.linalg.norm(self.M, axis=1).reshape((self.M.shape[0], 1))
        
    def contains(self, name):
        return name in self.names
        
    def query(self, qdocs, top_N=5):
        """
        Expects a list of lists (docs x toks)
        """
        q = self.vectorizer.transform(qdocs).toarray()
        if self.norm_vecs:
            q = q / np.linalg.norm(q, axis=0)
        num_queries = len(qdocs)
        S = self.M.dot(q.transpose())
        sorted_idxes = np.argsort(S, axis=0)
        if top_N is None:
            max_idxes = sorted_idxes
        else:
            max_idxes = sorted_idxes[-top_N:, :]
        
        # Assemble list of names
        titles = []
        for qnum in range(len(qdocs)):
            titles.append([self.names[idx] for idx in max_idxes[::-1, qnum]])
        return titles
    
    def get(self, name, theories=None):
        if theories is None:
            theories = sorted(self.all_theories.keys())
        for theory in theories:
            if name in self.all_theories[theory]:
                return self.all_theories[theory][name], theory
        return None, None
        # raise Exception("Could not identify name={} in theory set={}".format(name, theories))
    
    def sample(self, rand_obj=None):
        if rand_obj is None:
            return np.random.choice(list(self.all_lemmas.values()))
        else:
            return rand_obj.choice(list(self.all_lemmas.values()))
        


615 9216


In [3]:
theory_bank = TheoryBank(theory_files)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:07<00:00, 82.25it/s]


In [4]:
# Now go through each of the proofs and collect the lemma requests

LEMMA_OUTPUT_FPATH = Path("lemma_requests.json")
NAME = "name"
STEP = "step"
STATE = "state"
CMD = "command"
ARGS = "args"

if LEMMA_OUTPUT_FPATH.exists():
    # Load the lemma queries
    print("Lemma queries cached, loading")
    with open(LEMMA_OUTPUT_FPATH, 'r') as f:
        lemma_requests = json.load(f)
else:
    # Accumulate the lemma retrieval queries
    tqdm_iter = tqdm(proof_files)
    num_lemma_requests = 0
    lemma_requests = []


    for json_fpath in tqdm_iter:
        name = Path(json_fpath).stem
        sa_tuples = read_proof_session_lemmas(json_fpath)
        if sa_tuples is None:
            continue
        for step_num, sa_tuple in enumerate(sa_tuples):
            cmd, arg = sa_tuple[1], sa_tuple[2]
            if arg is not None and isinstance(arg, str):
                arg = arg.split("[")[0]
                if cmd in set(["lemma", "rewrite"]):
                    # expr, theory = theory_bank.get(arg)
                    num_lemma_requests += 1
                    lemma_requests.append({
                        STATE:  sa_tuple[0],
                        CMD: cmd,
                        ARGS: arg,
                        NAME: name,
                        STEP: step_num
                    })
                    tqdm_iter.set_description("# lemma requests={}".format(num_lemma_requests))

    # Save out the accumulated lemma requests to file                
    with open(LEMMA_OUTPUT_FPATH, 'w') as f:
        json.dump(lemma_requests, f)
        


Lemma queries cached, loading


In [5]:

# Split into train/test
from sklearn.model_selection import train_test_split
lemma_train, lemma_test = train_test_split(
     lemma_requests, train_size=0.6, random_state=1337, shuffle=True, stratify=None)
print("# train={}, test={}, total={}".format(len(lemma_train), len(lemma_test), len(lemma_requests)))

# train=12132, test=8089, total=20221


In [6]:
# Vectorize a query
def make_statestr(state, consequents_only=False):
    """ Given lemma state, converts into query string usable for theory bank"""
    collecting = not(consequents_only)
    toks = []
    for tok in state:
        if tok == "consequents":
            collecting = True
        elif collecting:
            toks.append(str(tok))
    return " ".join(toks)


In [7]:
# Make several theory banks
count_tb_nonorm = TheoryBank(theory_files, vectorizer_type=VecTypes.COUNT, norm_vecs=False)
tfidf_tb_nonorm = TheoryBank(theory_files, vectorizer_type=VecTypes.TFIDF, norm_vecs=False)
count_tb_norm = TheoryBank(theory_files, vectorizer_type=VecTypes.COUNT, norm_vecs=True)
tfidf_tb_norm = TheoryBank(theory_files, vectorizer_type=VecTypes.TFIDF, norm_vecs=True)

experiments = ( ("Count_NoNorm", count_tb_nonorm), ("TFIDF_NoNorm", tfidf_tb_nonorm),
              ("Count_Norm", count_tb_norm), ("TFIDF_Norm", tfidf_tb_norm))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:02<00:00, 288.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:02<00:00, 288.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:02<00:00, 274.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:02<00:00, 282.40it/s]


In [8]:
def compute_mrr(theory_bank):
    retrieval_ranks = []
    for req in tqdm(lemma_test):
        gold = req[NAME]
        if not(theory_bank.contains(gold)):
            continue
        state_str = make_statestr(req[STATE])
        retrieved = theory_bank.query([state_str], top_N=None)
        rank = retrieved[0].index(gold)
        retrieval_ranks.append(rank)
    mrr = np.mean([1/(rank + 1) for rank in retrieval_ranks])
    return mrr

# Supervised experiment.  
For each of the training items, set up a featurization that consists of q

In [9]:
from numpy.random import default_rng
import pdb
np_rng = default_rng(505)

def assemble_data(lemma_src, tb, limit=100):
    """
    Get one positive, exact match
    One easy negative, random sample
    One hard negative, random sample that is close
    """
    for i in tqdm(range(limit)):
        req = lemma_src[i]
        query_state_str = make_statestr(req[STATE])
        gold_lemma = tb.get(req[NAME])
        gold_lemma_str = make_statestr(gold_lemma)
        random_lemma_str = make_statestr(tb.sample())
        
        # Get the hard negative
        S = tb.M.dot(gold_x.transpose())
        sorted_idxes = np.argsort(S, axis=0)
        hard_neg_idx = np_rng.integers(low=1, high=len(sorted_idxes), size=1)[0]
        tb.
        hard_neg_name = tb.names[hard_neg_idx]
        hard_neg_lemma, _ = tb.get(hard_neg_name)
        hard_neg_str = make_statestr(hard_neg_lemma)


In [None]:
from sklearn import svm
# import sklearn.svm as svm
# Try ThunderSVM, GPU accelerated SVC
#num_train = 1000
num_train = 10
num_test = 10

svc_results = collections.OrderedDict()
for expname, theory_bank in experiments:
    svc = svm.LinearSVC()
    #svc = thundersvm.SVC()
    train_X, train_Y = assemble_data(lemma_train, theory_bank, num_train)
    test_X, test_Y = assemble_data(lemma_test, theory_bank, num_test)
    print("Data assembled, fitting...")
    svc.fit(train_X, train_Y)
    print("... testing")
    train_Yhat = svc.predict(train_X)
    test_Yhat = svc.predict(test_X)
    train_acc = np.sum(train_Yhat == train_Y) / train_X.shape[0]
    test_acc = np.sum(test_Yhat == test_Y) / test_X.shape[0]
    svc_results[expname] = {
        "train_acc": train_acc,
        "test_acc": test_acc
    }
    print("{}: train_acc={:.3f}, test_acc={:.3f}".format(expname, train_acc, test_acc))

  return np.random.choice(list(self.all_lemmas.values()))
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 16.10it/s]
  X = np.array(X)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 16.78it/s]
