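This notebook evaluates the last-command prediction models trained on both V1 and V2 of the data, across different input combinations (full input, commands only, and command history stripped). The cells below cover the V1 models.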




In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import torch
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics import classification_report

from exp_setup import *
from coprover.results_analysis import GuessGoldTelemetry
from coprover.training.simplet5 import SimpleT5

# Root directory of the trained checkpoints, and where evaluation CSVs are written.
MODELS_DIR = Path("outputs")
RESULTS_DIR = Path("results") / "t5" / "v1"

# Device string passed to SimpleT5.load_model as use_device.
DEVICE = "cuda:1"

# Experiment registry: name -> (strip_cmdhistory, cmds_only, checkpoint path).
# Each row below is (name, strip_cmdhistory, cmds_only, run directory under MODELS_DIR).
MODELS = {
    name: (strip_flag, cmds_flag, MODELS_DIR / run_dir / "best_model")
    for name, strip_flag, cmds_flag, run_dir in (
        ("t5_full_v1", False, False, "laststep_pred_v1"),
        ("t5_cmdsonly_v1", False, True, "laststep_pred_cmdsonly_v1"),
        ("t5_nocmds_v1", True, False, "laststep_pred_nocmds_v1"),
    )
}


Global seed set to 1337


In [6]:
def run_experiment(exp_name, strip_cmdhistory, cmds_only, model_fpath):
    """Evaluate one saved model on the test split and on a slice of the train split.

    Loads the checkpoint at ``model_fpath`` onto ``DEVICE``, builds the train/test
    frames with ``setup_laststep_pred_data``, predicts every test source text and
    the first 1000 train source texts, prints a classification report for each,
    and saves the test telemetry to ``RESULTS_DIR/<exp_name>.csv``.

    Args:
        exp_name: Label used in log lines, telemetry names and the CSV filename.
        strip_cmdhistory: Forwarded to ``setup_laststep_pred_data``.
        cmds_only: Forwarded to ``setup_laststep_pred_data``.
        model_fpath: Path of the saved model directory to load.

    Returns:
        Tuple ``(test_telem, train_telem)`` of ``GuessGoldTelemetry`` objects.
    """
    print(f"Running experiment: {exp_name}, model_fpath={model_fpath}, strip_cmdhistory={strip_cmdhistory}, cmds_only={cmds_only}")
    model = SimpleT5(source_max_token_len=SRC_MAX_TOKLEN, target_max_token_len=TGT_MAX_TOKLEN)
    model.load_model(model_fpath, use_gpu=True, use_device=DEVICE)
    train_df, test_df = setup_laststep_pred_data(strip_cmdhistory=strip_cmdhistory, cmds_only=cmds_only)
    print(f"Len train={len(train_df)}, test={len(test_df)}")
    print(f"Train, # pos={np.sum(train_df.target_text == POS)}, neg={np.sum(train_df.target_text == NEG)}")
    print(f"Test, # pos={np.sum(test_df.target_text == POS)}, neg={np.sum(test_df.target_text == NEG)}")

    # Eval Test: predict() returns a sequence; only its first element is kept.
    test_Y_gold = test_df.target_text.array
    test_Y_guess = [model.predict(src_txt)[0] for src_txt in tqdm(test_df.source_text.array)]
    # The name must be an f-string; the original passed the literal text
    # "{exp_name} Test", so the experiment name never reached the telemetry.
    test_telem = GuessGoldTelemetry(guesses=test_Y_guess, golds=test_Y_gold, target_names=[NEG, POS],
                                    name=f"{exp_name} Test")
    RESULTS_DIR.mkdir(exist_ok=True, parents=True)
    test_telem.save(Path(RESULTS_DIR, f"{exp_name}.csv"))
    print("Test Result")
    print(test_telem.class_report(return_dict=False))

    # Subsample train: only the first `idx` rows are scored, as a quick
    # train-set reference rather than a full pass.
    idx = 1000
    train_Y_gold = train_df.target_text.array[0:idx]
    train_Y_guess = [model.predict(src_txt)[0] for src_txt in tqdm(train_df.source_text.array[0:idx])]
    train_telem = GuessGoldTelemetry(guesses=train_Y_guess, golds=train_Y_gold, target_names=[NEG, POS],
                                     name=f"{exp_name} Train(0:{idx})")
    print("Train Result")
    print(train_telem.class_report(return_dict=False))
    return test_telem, train_telem
    

In [7]:
# Run every registered experiment. The loop variables are rebound on each
# iteration, so the telemetry is also stored per experiment name; otherwise only
# the last experiment's results would remain available after this cell.
TELEMETRY = {}
for exp_name, exp_tuple in MODELS.items():
    strip_cmdhistory, cmds_only, model_fpath = exp_tuple
    test_telem, train_telem = run_experiment(exp_name, strip_cmdhistory, cmds_only, model_fpath)
    TELEMETRY[exp_name] = (test_telem, train_telem)

Running experiment: t5_full_v1, model_fpath=outputs/laststep_pred_v1/best_model, strip_cmdhistory=False, cmds_only=False
Loading model_type=t5, dir=outputs/laststep_pred_v1/best_model, use_gpu=True
Len train=11162, test=1241
Len train=11162, test=1241
Train, # pos=5578, neg=5584
Test, # pos=626, neg=615


  0%|          | 0/1241 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1241/1241 [01:12<00:00, 17.10it/s]


Test Result
              precision    recall  f1-score   support

         neg       0.82      0.56      0.67       615
         pos       0.67      0.88      0.76       626

    accuracy                           0.72      1241
   macro avg       0.75      0.72      0.71      1241
weighted avg       0.75      0.72      0.71      1241



100%|██████████| 1000/1000 [00:58<00:00, 17.18it/s]


Train Result
              precision    recall  f1-score   support

         neg       0.90      0.62      0.73       527
         pos       0.69      0.92      0.79       473

    accuracy                           0.76      1000
   macro avg       0.79      0.77      0.76      1000
weighted avg       0.80      0.76      0.76      1000

Running experiment: t5_cmdsonly_v1, model_fpath=outputs/laststep_pred_cmdsonly_v1/best_model, strip_cmdhistory=False, cmds_only=True
Loading model_type=t5, dir=outputs/laststep_pred_cmdsonly_v1/best_model, use_gpu=True
Len train=11162, test=1241
Len train=11162, test=1241
Train, # pos=5578, neg=5584
Test, # pos=626, neg=615


100%|██████████| 1241/1241 [00:54<00:00, 22.85it/s]


Test Result
              precision    recall  f1-score   support

         neg       0.74      0.49      0.59       615
         pos       0.62      0.83      0.71       626

    accuracy                           0.66      1241
   macro avg       0.68      0.66      0.65      1241
weighted avg       0.68      0.66      0.65      1241



100%|██████████| 1000/1000 [00:43<00:00, 22.87it/s]


Train Result
              precision    recall  f1-score   support

         neg       0.83      0.53      0.65       527
         pos       0.63      0.88      0.73       473

    accuracy                           0.69      1000
   macro avg       0.73      0.70      0.69      1000
weighted avg       0.73      0.69      0.69      1000

Running experiment: t5_nocmds_v1, model_fpath=outputs/laststep_pred_nocmds_v1/best_model, strip_cmdhistory=True, cmds_only=False
Loading model_type=t5, dir=outputs/laststep_pred_nocmds_v1/best_model, use_gpu=True
Len train=11162, test=1241
Len train=11162, test=1241
Train, # pos=5578, neg=5584
Test, # pos=626, neg=615


  0%|          | 0/1241 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1241/1241 [01:12<00:00, 17.10it/s]


Test Result
              precision    recall  f1-score   support

         neg       0.69      0.64      0.66       615
         pos       0.67      0.71      0.69       626

    accuracy                           0.68      1241
   macro avg       0.68      0.68      0.68      1241
weighted avg       0.68      0.68      0.68      1241



100%|██████████| 1000/1000 [00:58<00:00, 17.16it/s]

Train Result
              precision    recall  f1-score   support

         neg       0.73      0.67      0.70       527
         pos       0.66      0.72      0.69       473

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.69      1000
weighted avg       0.70      0.69      0.69      1000




