In [1]:
import pandas as pd
# Read big_vul_with_event_trace.csv into a DataFrame and print its schema.
gbig_test_df = pd.read_csv(filepath_or_buffer="big_vul_with_event_trace.csv")
gbig_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170 entries, 0 to 1169
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  1170 non-null   object
 1   input        1170 non-null   object
 2   output       1170 non-null   int64 
 3   graph_path   1170 non-null   object
 4   event_trace  1170 non-null   object
dtypes: int64(1), object(4)
memory usage: 45.8+ KB


In [7]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
gbig_test_df = (
    gbig_test_df
    .rename(columns={"input": "func", "output": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

SEP = " // LOGIC: "
# Source text and event trace joined into a single string.
# NOTE(review): tokenize_fn in this cell reads batch["event_trace"], not this
# column, so "model_input" is currently unused -- confirm which text the saved
# model expects.
gbig_test_df['model_input'] = gbig_test_df['func'] + SEP + gbig_test_df['event_trace']

# Convert to HuggingFace Dataset
gbig_test_ds = Dataset.from_pandas(gbig_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace,
    and the ``model_input`` column built before this function is never read --
    confirm which text the saved model was trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
big_test_tok = (
    gbig_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
big_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

metrics = trainer.evaluate(eval_dataset=big_test_tok)

# ====== PRINT METRICS ======
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
print("\n=== Evaluation on BigVul Dataset ===")
eval_prefix = "eval_"
for metric_name, metric_value in metrics.items():
    if not metric_name.startswith(eval_prefix):
        continue
    print(f"{metric_name[len(eval_prefix):]:<12}: {metric_value:.4f}")


Map:   0%|          | 0/1170 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on BigVul Dataset ===
loss        : 0.7756
model_preparation_time: 0.0040
accuracy    : 0.5111
precision   : 0.5305
recall      : 0.1932
specificity : 0.8291
fpr         : 0.1709
f1          : 0.2832
mcc         : 0.0288
kappa       : 0.0222
mse         : 0.4889
mae         : 0.4889
auc         : 0.5343
runtime     : 11.3541
samples_per_second: 103.0470
steps_per_second: 12.9470


In [8]:
import pandas as pd
# Read diverse_with_event_trace_graph.csv into a DataFrame and print its schema.
gdiv_test_df = pd.read_csv(filepath_or_buffer="diverse_with_event_trace_graph.csv")
gdiv_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532 entries, 0 to 1531
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   code_snip    1532 non-null   object
 1   output       1532 non-null   int64 
 2   graph_path   1532 non-null   object
 3   event_trace  1532 non-null   object
dtypes: int64(1), object(3)
memory usage: 48.0+ KB


In [9]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
gdiv_test_df = (
    gdiv_test_df
    .rename(columns={"code_snip": "func", "output": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

SEP = " // LOGIC: "
# Source text and event trace joined into a single string.
# NOTE(review): tokenize_fn in this cell reads batch["event_trace"], not this
# column, so "model_input" is currently unused -- confirm which text the saved
# model expects.
gdiv_test_df['model_input'] = gdiv_test_df['func'] + SEP + gdiv_test_df['event_trace']

# Convert to HuggingFace Dataset
gdiv_test_ds = Dataset.from_pandas(gdiv_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace,
    and the ``model_input`` column built before this function is never read --
    confirm which text the saved model was trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
div_test_tok = (
    gdiv_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
div_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

metrics = trainer.evaluate(eval_dataset=div_test_tok)

# ====== PRINT METRICS ======
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
print("\n=== Evaluation on Diverse Dataset ===")
eval_prefix = "eval_"
for metric_name, metric_value in metrics.items():
    if not metric_name.startswith(eval_prefix):
        continue
    print(f"{metric_name[len(eval_prefix):]:<12}: {metric_value:.4f}")


Map:   0%|          | 0/1532 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on Diverse Dataset ===
loss        : 0.8824
model_preparation_time: 0.0070
accuracy    : 0.5313
precision   : 0.5556
recall      : 0.3133
specificity : 0.7493
fpr         : 0.2507
f1          : 0.4007
mcc         : 0.0696
kappa       : 0.0627
mse         : 0.4687
mae         : 0.4687
auc         : 0.5317
runtime     : 15.4585
samples_per_second: 99.1040
steps_per_second: 12.4200


In [10]:
import pandas as pd

# Load devign_with_event_trace.csv, discard the 'func' column, and cast the
# 'target' column to integer (True -> 1, False -> 0) in one chained expression.
dev_test_df = (
    pd.read_csv("devign_with_event_trace.csv")
    .drop(columns=['func'])
    .astype({'target': int})
)

# Print the resulting schema as a sanity check.
dev_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14031 entries, 0 to 14030
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   func_cleaned  14031 non-null  object
 1   project       14031 non-null  object
 2   target        14031 non-null  int64 
 3   ast_path      14031 non-null  object
 4   event_trace   14031 non-null  object
dtypes: int64(1), object(4)
memory usage: 548.2+ KB


In [12]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)


# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
dev_test_df = (
    dev_test_df
    .rename(columns={"func_cleaned": "func", "target": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

# Convert to HuggingFace Dataset
dev_test_ds = Dataset.from_pandas(dev_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace
    rather than the ``func`` column -- confirm which text the saved model was
    trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
devv_test_tok = (
    dev_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
devv_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

metrics = trainer.evaluate(eval_dataset=devv_test_tok)

# ====== PRINT METRICS ======
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
print("\n=== Evaluation on Devign Dataset ===")
eval_prefix = "eval_"
for metric_name, metric_value in metrics.items():
    if not metric_name.startswith(eval_prefix):
        continue
    print(f"{metric_name[len(eval_prefix):]:<12}: {metric_value:.4f}")


Map:   0%|          | 0/14031 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on Devign Dataset ===
loss        : 0.6864
model_preparation_time: 0.0000
accuracy    : 0.5731
precision   : 0.5134
recall      : 0.0223
specificity : 0.9842
fpr         : 0.0158
f1          : 0.0428
mcc         : 0.0239
kappa       : 0.0074
mse         : 0.4269
mae         : 0.4269
auc         : 0.5127
runtime     : 134.1288
samples_per_second: 104.6080
steps_per_second: 13.0770


In [14]:
import pandas as pd
# Read djuliet_with_event_trace.csv into a DataFrame and print its schema.
juliet_test_df = pd.read_csv(filepath_or_buffer="djuliet_with_event_trace.csv")
juliet_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   code_snip    3152 non-null   object
 1   output       3152 non-null   int64 
 2   graph_path   3152 non-null   object
 3   event_trace  3152 non-null   object
dtypes: int64(1), object(3)
memory usage: 98.6+ KB


In [15]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
juliet_test_df = (
    juliet_test_df
    .rename(columns={"code_snip": "func", "output": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

SEP = " // LOGIC: "
# Source text and event trace joined into a single string.
# NOTE(review): tokenize_fn in this cell reads batch["event_trace"], not this
# column, so "model_input" is currently unused -- confirm which text the saved
# model expects.
juliet_test_df['model_input'] = juliet_test_df['func'] + SEP + juliet_test_df['event_trace']

# Convert to HuggingFace Dataset
juliet_test_ds = Dataset.from_pandas(juliet_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace,
    and the ``model_input`` column built before this function is never read --
    confirm which text the saved model was trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
juliet_test_tok = (
    juliet_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
juliet_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

metrics = trainer.evaluate(eval_dataset=juliet_test_tok)

# ====== PRINT METRICS ======
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
print("\n=== Evaluation on juliet Dataset ===")
eval_prefix = "eval_"
for metric_name, metric_value in metrics.items():
    if not metric_name.startswith(eval_prefix):
        continue
    print(f"{metric_name[len(eval_prefix):]:<12}: {metric_value:.4f}")


Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on juliet Dataset ===
loss        : 0.9331
model_preparation_time: 0.0154
accuracy    : 0.5305
precision   : 0.5625
recall      : 0.2741
specificity : 0.7868
fpr         : 0.2132
f1          : 0.3686
mcc         : 0.0709
kappa       : 0.0609
mse         : 0.4695
mae         : 0.4695
auc         : 0.5227
runtime     : 32.7777
samples_per_second: 96.1630
steps_per_second: 12.0200


In [16]:
import pandas as pd
# Read reveal_with_event_trace.csv into a DataFrame and print its schema.
reveal_test_df = pd.read_csv(filepath_or_buffer="reveal_with_event_trace.csv")
reveal_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2028 entries, 0 to 2027
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  2028 non-null   object
 1   input        2028 non-null   object
 2   output       2028 non-null   int64 
 3   graph_path   2028 non-null   object
 4   event_trace  2028 non-null   object
dtypes: int64(1), object(4)
memory usage: 79.3+ KB


In [17]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
reveal_test_df = (
    reveal_test_df
    .rename(columns={"input": "func", "output": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

SEP = " // LOGIC: "
# Source text and event trace joined into a single string.
# NOTE(review): tokenize_fn in this cell reads batch["event_trace"], not this
# column, so "model_input" is currently unused -- confirm which text the saved
# model expects.
reveal_test_df['model_input'] = reveal_test_df['func'] + SEP + reveal_test_df['event_trace']

# Convert to HuggingFace Dataset
reveal_test_ds = Dataset.from_pandas(reveal_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace,
    and the ``model_input`` column built before this function is never read --
    confirm which text the saved model was trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
reveal_test_tok = (
    reveal_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
reveal_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

metrics = trainer.evaluate(eval_dataset=reveal_test_tok)

# ====== PRINT METRICS ======
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
print("\n=== Evaluation on RVl Dataset ===")
eval_prefix = "eval_"
for metric_name, metric_value in metrics.items():
    if not metric_name.startswith(eval_prefix):
        continue
    print(f"{metric_name[len(eval_prefix):]:<12}: {metric_value:.4f}")


Map:   0%|          | 0/2028 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on RVl Dataset ===
loss        : 0.9377
model_preparation_time: 0.0081
accuracy    : 0.4921
precision   : 0.4839
recall      : 0.2367
specificity : 0.7475
fpr         : 0.2525
f1          : 0.3179
mcc         : -0.0184
kappa       : -0.0158
mse         : 0.5079
mae         : 0.5079
auc         : 0.5107
runtime     : 19.7643
samples_per_second: 102.6090
steps_per_second: 12.8510


In [18]:
import pandas as pd
# Read cvefixes_with_event_trace.csv into a DataFrame and print its schema.
cv_test_df = pd.read_csv(filepath_or_buffer="cvefixes_with_event_trace.csv")
cv_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4216 entries, 0 to 4215
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  4216 non-null   object
 1   input        4216 non-null   object
 2   output       4216 non-null   int64 
 3   idx          4216 non-null   int64 
 4   graph_path   4216 non-null   object
 5   event_trace  4216 non-null   object
dtypes: int64(2), object(4)
memory usage: 197.8+ KB


In [19]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
cv_test_df = (
    cv_test_df
    .rename(columns={"input": "func", "output": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

SEP = " // LOGIC: "
# Source text and event trace joined into a single string.
# NOTE(review): tokenize_fn in this cell reads batch["event_trace"], not this
# column, so "model_input" is currently unused -- confirm which text the saved
# model expects.
cv_test_df['model_input'] = cv_test_df['func'] + SEP + cv_test_df['event_trace']

# Convert to HuggingFace Dataset
cv_test_ds = Dataset.from_pandas(cv_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace,
    and the ``model_input`` column built before this function is never read --
    confirm which text the saved model was trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
cv_test_tok = (
    cv_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
cv_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=cv_test_tok)

# ====== PRINT METRICS ======
# The banner previously read "RVl Dataset" (copied from the ReVeal cell); this
# cell evaluates the data loaded from cvefixes_with_event_trace.csv.
print("\n=== Evaluation on CVEfixes Dataset ===")
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map:   0%|          | 0/4216 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on RVl Dataset ===
loss        : 0.9430
model_preparation_time: 0.0060
accuracy    : 0.5047
precision   : 0.5062
recall      : 0.3857
specificity : 0.6238
fpr         : 0.3762
f1          : 0.4378
mcc         : 0.0098
kappa       : 0.0095
mse         : 0.4953
mae         : 0.4953
auc         : 0.5039
runtime     : 41.1136
samples_per_second: 102.5450
steps_per_second: 12.8180


In [20]:
import pandas as pd
# Read mixvul_with_event_trace.csv into a DataFrame and print its schema.
CV_test_df = pd.read_csv(filepath_or_buffer="mixvul_with_event_trace.csv")
CV_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  2864 non-null   object
 1   input        2864 non-null   object
 2   output       2864 non-null   int64 
 3   graph_path   2864 non-null   object
 4   event_trace  2864 non-null   object
dtypes: int64(1), object(4)
memory usage: 112.0+ KB


In [21]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
# Model weights and tokenizer are both read from the same checkpoint directory.
saved_dir = './cdl_codebert_model_final'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Standardize column names, then drop rows missing any field used downstream.
# "event_trace" is part of the subset because tokenize_fn concatenates it with
# a string prefix, which raises TypeError on a missing (NaN/None) value.
CV_test_df = (
    CV_test_df
    .rename(columns={"input": "func", "output": "label"})
    .dropna(subset=["func", "label", "event_trace"])
)

SEP = " // LOGIC: "
# Source text and event trace joined into a single string.
# NOTE(review): tokenize_fn in this cell reads batch["event_trace"], not this
# column, so "model_input" is currently unused -- confirm which text the saved
# model expects.
CV_test_df['model_input'] = CV_test_df['func'] + SEP + CV_test_df['event_trace']

# Convert to HuggingFace Dataset
CV_test_ds = Dataset.from_pandas(CV_test_df)

# Tokenization of the event_trace column (each entry gets a "FUNC: " prefix)
def tokenize_fn(batch):
    """Tokenize one batch of rows for the classifier.

    Reads ``batch["event_trace"]``, prepends the literal ``"FUNC: "`` to every
    entry, and runs the module-level ``tokenizer`` with truncation and padding
    to a fixed length of 256 tokens.

    NOTE(review): the prefix says FUNC but the text comes from the event trace,
    and the ``model_input`` column built before this function is never read --
    confirm which text the saved model was trained on.
    """
    prefixed = []
    for trace in batch["event_trace"]:
        prefixed.append("FUNC: " + trace)
    return tokenizer(
        prefixed,
        max_length=256,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches, rename "label" to "labels", then restrict the dataset's
# output to the three listed columns in torch format.
CV_test_tok = (
    CV_test_ds
    .map(tokenize_fn, batched=True)
    .rename_column("label", "labels")
)
CV_test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes ``logits`` is 2-D with one
            column per class (column 1 = positive class) and ``labels`` holds
            0/1 integers -- TODO confirm against the saved model's config.

    Returns:
        dict mapping metric name to value: accuracy, precision, recall,
        specificity, fpr, f1, mcc, kappa, mse, mae and auc. ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2. Without it, data where
    # labels and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Softmax probability of class 1, used only for ROC-AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr":         fp / negatives if negatives > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


# Evaluation-only Trainer: no training arguments are passed here.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=CV_test_tok)

# ====== PRINT METRICS ======
# The banner previously read "RVl Dataset" (copied from the ReVeal cell); this
# cell evaluates the data loaded from mixvul_with_event_trace.csv.
print("\n=== Evaluation on MixVul Dataset ===")
# Only keys carrying the "eval_" prefix are shown, with the prefix stripped.
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map:   0%|          | 0/2864 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Evaluation on RVl Dataset ===
loss        : 0.8386
model_preparation_time: 0.0020
accuracy    : 0.5468
precision   : 0.5821
recall      : 0.3317
specificity : 0.7619
fpr         : 0.2381
f1          : 0.4226
mcc         : 0.1037
kappa       : 0.0936
mse         : 0.4532
mae         : 0.4532
auc         : 0.5799
runtime     : 28.6846
samples_per_second: 99.8440
steps_per_second: 12.4810
