Implements a simple baseline that uses classic techniques as baselines

In [24]:
import pandas as pd
import numpy as np
import sklearn
from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report



In [38]:
output_root = Path("outputs")
output_root.mkdir(exist_ok=True, parents=True)

inst_df = pd.read_csv("laststep_pred.v1.csv.gz")
train_df, test_df = train_test_split(inst_df, random_state=501, test_size=0.1)
print(len(train_df), len(test_df))

11162 1241


In [49]:
train_df

Unnamed: 0.1,Unnamed: 0,proofname,source_text,cmd_history,target_text
7375,7375,convex_functions-proofs/convex_const_on_connec...,cross-mult simplify simplify <ANT> <CONS> s-fo...,cross-mult simplify simplify,neg
11790,11790,traces-proofs/terminating_finite_traces_j,lemma instantiate inst? <ANT> <CONS> s-formula...,lemma instantiate inst?,neg
4266,4266,real_orders-proofs/lt_ne_gt,NOOP decompose-equality inst <ANT> s-formula a...,NOOP decompose-equality inst,pos
904,904,cont_vect2_real-proofs/inv_fun_continuous_vr,instantiate inst? lemma <ANT> <CONS> s-formula...,instantiate inst? lemma,pos
11005,11005,sigma-proofs/sigma_downwards_TCC2,NOOP skolem skeep <ANT> <CONS> s-formula foral...,NOOP skolem skeep,neg
...,...,...,...,...,...
7648,7648,derivative_props-proofs/mean_value_abs,instantiate inst? simplify <ANT> <CONS> s-form...,instantiate inst? simplify,neg
8783,8783,limit_vect2_vect2-proofs/lim_const_fun,NOOP NOOP auto-rewrite <ANT> <CONS> s-formula ...,NOOP NOOP auto-rewrite,neg
2727,2727,lines_2D-proofs/test3,expand inst expand <ANT> <CONS> s-formula fora...,expand inst expand,pos
671,671,cardinal-proofs/cardinal_lt,split ground simplify <ANT> <CONS> s-formula f...,split ground simplify,pos


In [13]:
Y_train = np.zeros(len(train_df))
Y_train[train_df.target_text == 'pos'] = 1

Y_test = np.zeros(len(test_df))
Y_test[test_df.target_text == 'pos'] = 1


In [41]:
def featurize_cmds(df):
    datums = []
    for cmds in df.cmd_history.array:
        datum = {}
        for idx, cmd in enumerate(cmds.split()):
            datum["{}_{}".format(cmd, idx)] = 1
        datums.append(datum)
    return datums

In [42]:
cmd_vectorizer = DictVectorizer(sparse=False)
train_datums = featurize_cmds(train_df)
test_datums = featurize_cmds(test_df)
cmd_vectorizer.fit(train_datums)
X_train = cmd_vectorizer.transform(train_datums)
X_test = cmd_vectorizer.transform(test_datums)


In [43]:
len(train_datums)

11162

In [44]:
print(X_train.shape, Y_train.shape)

(11162, 233) (11162,)


In [45]:

clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_train, Y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [91]:
num_pos = np.sum(Y_train)
num_neg = len(Y_train) - num_pos
print(num_pos, num_neg)
if num_neg > num_pos:
    print("Train majority guess is negative")
    Y_baseline = np.zeros(Y_hat.shape) # Majority negative
else:
    print("Train majority guess is positive")
    Y_baseline = np.ones(Y_hat.shape)


5578.0 5584.0
Train majority guess is negative


In [92]:

Y_hat = clf.predict(X_test)
print("Classification Report, Linear SVC on command window only")
print(classification_report(Y_hat, Y_test, target_names=['neg', 'pos']))

print("Classification Report, Baseline Guess")
print(classification_report(Y_baseline, Y_test, target_names=['neg', 'pos']))

Classification Report, Linear SVC on command window only
              precision    recall  f1-score   support

         neg       0.55      0.71      0.62       479
         pos       0.78      0.64      0.70       762

    accuracy                           0.66      1241
   macro avg       0.66      0.67      0.66      1241
weighted avg       0.69      0.66      0.67      1241

Classification Report, Baseline Guess
              precision    recall  f1-score   support

         neg       1.00      0.50      0.66      1241
         pos       0.00      0.00      0.00         0

    accuracy                           0.50      1241
   macro avg       0.50      0.25      0.33      1241
weighted avg       1.00      0.50      0.66      1241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# Now try by featurizing the state tokens, without the command history
def featurize_state(df):
    datums = []
    for raw_state in df.source_text.array:
        raw_state = " ".join(raw_state.split()[3:]) # Get everything but the commands
        datums.append(raw_state)
    return datums

In [62]:
tfidf_vectorizer = TfidfVectorizer(sparse=False)
train_docs = featurize_state(train_df)
print(len(train_docs))
tfidf_vectorizer.fit(train_docs)

TypeError: __init__() got an unexpected keyword argument 'sparse'

In [63]:
X_state_train = tfidf_vectorizer.transform(train_docs).todense()

In [64]:
test_docs = featurize_state(test_df)
print(len(test_docs))
X_state_test = tfidf_vectorizer.transform(test_docs).todense()

1241


In [65]:
state_clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
state_clf.fit(X_state_train, Y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [93]:
print("State based SVC Results")
Y_state_hat = state_clf.predict(X_state_test)
print(classification_report(Y_state_hat, Y_test, target_names=['neg', 'pos']))

print("Baseline guess results")
print(classification_report(Y_baseline, Y_test, target_names=['neg', 'pos']))

State based SVC Results
              precision    recall  f1-score   support

         neg       0.53      0.61      0.57       531
         pos       0.67      0.59      0.63       710

    accuracy                           0.60      1241
   macro avg       0.60      0.60      0.60      1241
weighted avg       0.61      0.60      0.60      1241

Baseline guess results
              precision    recall  f1-score   support

         neg       1.00      0.50      0.66      1241
         pos       0.00      0.00      0.00         0

    accuracy                           0.50      1241
   macro avg       0.50      0.25      0.33      1241
weighted avg       1.00      0.50      0.66      1241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
Y_baseline = np.zeros(Y_hat.shape)
Y_baseline

array([0., 0., 0., ..., 0., 0., 0.])