This evaluates the last command prediction model trained in 230209_train_t5_v1.py.

This uses the same train/test split obtained from 230209_data_setup.ipynb.

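Initially, the results seem promising against a baseline of always guessing negative (~50% accuracy).  On the test set we get the report below. (Note: these figures do not match the output of the final test-set cell in this notebook, which reports 0.68 accuracy for the `laststep_pred_nocmds_v1` checkpoint; the table below was presumably recorded from a different checkpoint or an earlier run — TODO confirm.)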

             precision    recall  f1-score   support

         neg       0.82      0.56      0.67       615
         pos       0.67      0.88      0.76       626

    accuracy                           0.72      1241
   macro avg       0.75      0.72      0.71      1241
weighted avg       0.75      0.72      0.71      1241


In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import torch
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics import classification_report

from exp_setup import *
from coprover.training.simplet5 import SimpleT5

# Checkpoint selection.
#
# The previous form, Path("outputs", "/abs/dir", "best_model"), silently dropped
# the leading "outputs" segment: pathlib discards every segment that precedes an
# absolute one.  The path is now written as what it actually resolves to.
# NOTE(review): this root is machine-specific; adjust it when running elsewhere.
MODEL_OUTPUT_ROOT = Path("/home/fennel2/yeh/proj/CoProver/src/notebooks/230209_completion_measure/outputs")

# Name of the training run to evaluate.  Alternatives that were previously kept
# in this cell as commented-out paths:
#   "laststep_pred_cmdsonly_v1"  (under MODEL_OUTPUT_ROOT)
#   "laststep_red_v1"            (under the relative ./outputs directory)
MODEL_RUN_NAME = "laststep_pred_nocmds_v1"

T5_MODEL_FPATH = MODEL_OUTPUT_ROOT / MODEL_RUN_NAME / "best_model"

Global seed set to 1337


In [2]:
# Build the SimpleT5 wrapper with the source/target token limits
# (SRC_MAX_TOKLEN / TGT_MAX_TOKLEN are not defined in this notebook; presumably
# they come from the `exp_setup` star import — TODO confirm).
model = SimpleT5(
    source_max_token_len=SRC_MAX_TOKLEN,
    target_max_token_len=TGT_MAX_TOKLEN,
)

# Keep using the second GPU when the host has one, but fall back to the first
# GPU on single-GPU machines instead of failing on a non-existent "cuda:1".
EVAL_DEVICE = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
model.load_model(T5_MODEL_FPATH, use_gpu=True, use_device=EVAL_DEVICE)

Loading model_type=t5, dir=/home/fennel2/yeh/proj/CoProver/src/notebooks/230209_completion_measure/outputs/laststep_pred_nocmds_v1/best_model, use_gpu=True


In [3]:
# Load the train/test frames for last-step prediction.  The flags request the
# variant with the command history stripped and not restricted to commands only.
# NOTE(review): these flags presumably need to match how the checkpoint in
# T5_MODEL_FPATH was trained — confirm when switching checkpoints.
laststep_splits = setup_laststep_pred_data(
    strip_cmdhistory=True,
    cmds_only=False,
)
train_df, test_df = laststep_splits

Len train=11162, test=1241


In [6]:
# Class balance of the training split: number of "pos" rows, then "neg" rows.
for label in ("pos", "neg"):
    print((train_df.target_text == label).sum())

5578
5584


In [7]:
# Class balance of the test split: number of "pos" rows, then "neg" rows.
for label in ("pos", "neg"):
    print((test_df.target_text == label).sum())

626
615


In [4]:
# Sanity check on the training split: score the model on the first `idx`
# training rows and print a per-class report.
idx = 1000  # number of leading training rows to evaluate
train_labels = ["neg", "pos"]

# Gold labels stay a pandas array so later cells can compare them element-wise.
Y_gold = train_df.target_text.array[:idx]

# model.predict is indexed with [0]; only that first element is kept per input.
# NOTE(review): the tokenizer warns that some inputs exceed the 512-token
# maximum; how SimpleT5.predict handles those is not visible here — TODO confirm.
Y_guess = [
    model.predict(src_txt)[0]
    for src_txt in tqdm(train_df.source_text.array[:idx])
]

# `labels` is passed explicitly so the rows are pinned to neg/pos.  With only
# `target_names`, a single unexpected generated string makes the class count
# differ from the two names and classification_report raises a ValueError.
print(classification_report(Y_gold, Y_guess, labels=train_labels, target_names=train_labels))

  1%|██▏                                                                                                                                                                    | 13/1000 [00:00<00:57, 17.28it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1238 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:57<00:00, 17.28it/s]

              precision    recall  f1-score   support

         neg       0.73      0.67      0.70       527
         pos       0.66      0.72      0.69       473

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.69      1000
weighted avg       0.70      0.69      0.69      1000






In [5]:
# Fraction of "pos" labels among the gold labels currently held in Y_gold.
is_pos = Y_gold == "pos"
np.sum(is_pos) / len(Y_gold)

0.473

In [6]:
# Held-out evaluation: score the model on every row of the test split and
# print a per-class report.
test_labels = ["neg", "pos"]

Y_gold = test_df.target_text.array

# model.predict is indexed with [0]; only that first element is kept per input.
Y_guess = [
    model.predict(src_txt)[0]
    for src_txt in tqdm(test_df.source_text.array)
]

# `labels` is passed explicitly so the rows are pinned to neg/pos.  With only
# `target_names`, a single unexpected generated string makes the class count
# differ from the two names and classification_report raises a ValueError.
print(classification_report(Y_gold, Y_guess, labels=test_labels, target_names=test_labels))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1241/1241 [01:12<00:00, 17.19it/s]


              precision    recall  f1-score   support

         neg       0.69      0.64      0.66       615
         pos       0.67      0.71      0.69       626

    accuracy                           0.68      1241
   macro avg       0.68      0.68      0.68      1241
weighted avg       0.68      0.68      0.68      1241

