In [1]:
import pandas as pd
train_df = pd.read_csv('train_fol.csv')
test_df = pd.read_csv('test_fol.csv')

devign with fol

In [14]:
import numpy as np
import torch
import pandas as pd
import time
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== STEP 1: Data Preprocessing ======
train_df = train_df.rename(columns={
    "func_cleaned": "func",
    "target": "label",
    "fol_logic": "fol"
})
test_df = test_df.rename(columns={
    "func_cleaned": "func",
    "target": "label"
})

train_df = train_df.dropna(subset=["fol", "func", "label"])
test_df = test_df.dropna(subset=["func", "label"])

train_df["label"] = train_df["label"].astype(int)
test_df["label"] = test_df["label"].astype(int)

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

# ====== STEP 2: Tokenizer Setup ======
model_name = "microsoft/unixcoder-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Train: use FOL + FUNC
def tokenize_fn_train(batch):
    return tokenizer(
        ["FOL: " + f + " FUNC: " + c for f, c in zip(batch["fol"], batch["func"])],
        truncation=True, padding='max_length', max_length=256
    )

# Test: use FUNC only
def tokenize_fn_test(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

train_tok = train_ds.map(tokenize_fn_train, batched=True)
test_tok  = test_ds.map(tokenize_fn_test, batched=True)

# ====== STEP 3: Prepare datasets ======
train_tok = train_tok.rename_column('label', 'labels')
test_tok  = test_tok.rename_column('label', 'labels')
train_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# ====== STEP 4: Load UnixCoder Model ======
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.config.problem_type = "single_label_classification"
model.config.id2label = {0: "safe", 1: "vuln"}
model.config.label2id = {"safe": 0, "vuln": 1}

# ====== STEP 5: Metrics ======
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }

# ====== STEP 6: Training Setup ======
data_collator = DataCollatorWithPadding(tokenizer)
args = TrainingArguments(
    output_dir='./unixcoder_trainFOL_testFUNC',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

# ====== STEP 7: Train ======
start_train = time.time()
trainer.train()
end_train = time.time()
train_runtime = end_train - start_train
print(f"\n=== Training completed in {train_runtime:.2f} seconds ({train_runtime / 60:.2f} minutes) ===")

# ====== STEP 8: Evaluate ======
start_eval = time.time()
metrics = trainer.evaluate()
end_eval = time.time()
eval_runtime = end_eval - start_eval
print(f"\n=== Evaluation completed in {eval_runtime:.2f} seconds ({eval_runtime / 60:.2f} minutes) ===")

print("\n=== Hold‐out Metrics (Trained with FOL + FUNC, Tested with FUNC only) ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")

print(f"Train Time (s): {train_runtime:.2f}")
print(f"Eval Time (s):  {eval_runtime:.2f}")

# ====== STEP 9: Save Model ======
save_dir = './unixcoder_trainFOL_testFUNC_saved'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\nModel and tokenizer saved to {save_dir}")


Map: 100%|██████████| 7952/7952 [00:05<00:00, 1545.13 examples/s]
Map: 100%|██████████| 2807/2807 [00:01<00:00, 1977.45 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Specificity,Fpr,F1,Mcc,Kappa,Mse,Mae,Auc
1,0.6477,0.668194,0.592091,0.539054,0.402961,0.736644,0.263356,0.461176,0.147833,0.143885,0.407909,0.407909,0.597486
2,0.661,0.651721,0.59031,0.520049,0.703947,0.503457,0.496543,0.598183,0.208684,0.199107,0.40969,0.40969,0.661306
3,0.5195,0.709579,0.620235,0.561375,0.564145,0.663105,0.336895,0.562756,0.22712,0.227118,0.379765,0.379765,0.678595


Checkpoint destination directory ./unixcoder_trainFOL_testFUNC\checkpoint-994 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./unixcoder_trainFOL_testFUNC\checkpoint-1988 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./unixcoder_trainFOL_testFUNC\checkpoint-2982 already exists and is non-empty. Saving will proceed but saved results may be invalid.



=== Training completed in 908.53 seconds (15.14 minutes) ===



=== Evaluation completed in 30.50 seconds (0.51 minutes) ===

=== Hold‐out Metrics (Trained with FOL + FUNC, Tested with FUNC only) ===
loss        : 0.6517
accuracy    : 0.5903
precision   : 0.5200
recall      : 0.7039
specificity : 0.5035
fpr         : 0.4965
f1          : 0.5982
mcc         : 0.2087
kappa       : 0.1991
mse         : 0.4097
mae         : 0.4097
auc         : 0.6613
runtime     : 30.4954
samples_per_second: 92.0470
steps_per_second: 11.5100
Train Time (s): 908.53
Eval Time (s):  30.50

Model and tokenizer saved to ./unixcoder_trainFOL_testFUNC_saved


In [9]:
import pandas as pd
devign_test_df = pd.read_csv("test_fol.csv")
devign_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2807 entries, 0 to 2806
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   func_cleaned   2807 non-null   object
 1   target         2807 non-null   bool  
 2   fol_logic      1934 non-null   object
 3   combined_code  2807 non-null   object
dtypes: bool(1), object(3)
memory usage: 68.7+ KB


In [15]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
devign_test_df = devign_test_df.rename(columns={"func_cleaned": "func", "target": "label"})
devign_test_df = devign_test_df.dropna(subset=["func", "label"])
devign_test_df["label"] = devign_test_df["label"].astype(int)

# Convert to HuggingFace Dataset
devign_test_ds = Dataset.from_pandas(devign_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

devign_test_tok = devign_test_ds.map(tokenize_fn, batched=True)
devign_test_tok = devign_test_tok.rename_column("label", "labels")
devign_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=devign_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on devign Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 2807/2807 [00:01<00:00, 1637.17 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on devign Dataset ===
loss        : 0.6517
accuracy    : 0.5903
precision   : 0.5200
recall      : 0.7039
specificity : 0.5035
fpr         : 0.4965
f1          : 0.5982
mcc         : 0.2087
kappa       : 0.1991
mse         : 0.4097
mae         : 0.4097
auc         : 0.6613
runtime     : 31.4705
samples_per_second: 89.1950
steps_per_second: 11.1530


bigvul with FOL

In [16]:
import pandas as pd
gbig_test_df = pd.read_csv("big_vultest.csv")
gbig_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170 entries, 0 to 1169
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  1170 non-null   object
 1   input        1170 non-null   object
 2   output       1170 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 27.5+ KB


In [17]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
gbig_test_df = gbig_test_df.rename(columns={"input": "func", "output": "label"})
gbig_test_df = gbig_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
gbig_test_ds = Dataset.from_pandas(gbig_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

big_test_tok = gbig_test_ds.map(tokenize_fn, batched=True)
big_test_tok = big_test_tok.rename_column("label", "labels")
big_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=big_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on BigVul Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 1170/1170 [00:00<00:00, 3025.06 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on BigVul Dataset ===
loss        : 0.7253
accuracy    : 0.4923
precision   : 0.4934
recall      : 0.5709
specificity : 0.4137
fpr         : 0.5863
f1          : 0.5293
mcc         : -0.0156
kappa       : -0.0154
mse         : 0.5077
mae         : 0.5077
auc         : 0.4936
runtime     : 11.5780
samples_per_second: 101.0540
steps_per_second: 12.6970


Diverse fol

In [18]:
import pandas as pd
gdiv_test_df = pd.read_csv("diverse_test.csv")
gdiv_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532 entries, 0 to 1531
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code_snip  1532 non-null   object
 1   output     1532 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 24.1+ KB


In [19]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
gdiv_test_df = gdiv_test_df.rename(columns={"code_snip": "func", "output": "label"})
gdiv_test_df= gdiv_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
gdiv_test_ds = Dataset.from_pandas(gdiv_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

div_test_tok = gdiv_test_ds.map(tokenize_fn, batched=True)
div_test_tok = div_test_tok.rename_column("label", "labels")
div_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=div_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on Diverse Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 1532/1532 [00:00<00:00, 3487.07 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Diverse Dataset ===
loss        : 0.7030
accuracy    : 0.5287
precision   : 0.5226
recall      : 0.6632
specificity : 0.3943
fpr         : 0.6057
f1          : 0.5846
mcc         : 0.0596
kappa       : 0.0574
mse         : 0.4713
mae         : 0.4713
auc         : 0.5482
runtime     : 14.2622
samples_per_second: 107.4170
steps_per_second: 13.4620


julirt fol

In [20]:
import pandas as pd
juliet_test_df = pd.read_csv("djuliet_test.csv")
juliet_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code_snip  3152 non-null   object
 1   output     3152 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 49.4+ KB


In [21]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
juliet_test_df = juliet_test_df.rename(columns={"code_snip": "func", "output": "label"})
juliet_test_df = juliet_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
juliet_test_ds = Dataset.from_pandas(juliet_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

juliet_test_tok = juliet_test_ds.map(tokenize_fn, batched=True)
juliet_test_tok = juliet_test_tok.rename_column("label", "labels")
juliet_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=juliet_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on Juliet Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 3152/3152 [00:00<00:00, 5093.71 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Juliet Dataset ===
loss        : 0.6742
accuracy    : 0.5543
precision   : 0.6034
recall      : 0.3166
specificity : 0.7919
fpr         : 0.2081
f1          : 0.4153
mcc         : 0.1233
kappa       : 0.1085
mse         : 0.4457
mae         : 0.4457
auc         : 0.6174
runtime     : 30.2564
samples_per_second: 104.1760
steps_per_second: 13.0220


rvl fol

In [22]:
import pandas as pd
reveal_test_df = pd.read_csv("Reveal_vultest.csv")
reveal_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2028 entries, 0 to 2027
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  2028 non-null   object
 1   input        2028 non-null   object
 2   output       2028 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.7+ KB


In [23]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame
reveal_test_df = pd.read_csv("Reveal_vultest.csv")

# Standardize column names and drop missing values
reveal_test_df = reveal_test_df.rename(columns={"input": "func", "output": "label"})
reveal_test_df = reveal_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
reveal_test_ds = Dataset.from_pandas(reveal_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

reveal_test_tok = reveal_test_ds.map(tokenize_fn, batched=True)
reveal_test_tok = reveal_test_tok.rename_column("label", "labels")
reveal_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=reveal_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on Reveal Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 2028/2028 [00:00<00:00, 3839.83 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Reveal Dataset ===
loss        : 0.7242
accuracy    : 0.5158
precision   : 0.5144
recall      : 0.5621
specificity : 0.4694
fpr         : 0.5306
f1          : 0.5372
mcc         : 0.0317
kappa       : 0.0316
mse         : 0.4842
mae         : 0.4842
auc         : 0.5109
runtime     : 19.5568
samples_per_second: 103.6980
steps_per_second: 12.9880


mixvul fol

In [24]:
import pandas as pd
mix_test_df = pd.read_csv("mix_test_vultest.csv")
mix_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  2864 non-null   object
 1   input        2864 non-null   object
 2   output       2864 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 67.2+ KB


In [25]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======


# Standardize column names and drop missing values
mix_test_df = mix_test_df.rename(columns={"input": "func", "output": "label"})
mix_test_df = mix_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
mix_test_ds = Dataset.from_pandas(mix_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

mix_test_tok = mix_test_ds.map(tokenize_fn, batched=True)
mix_test_tok = mix_test_tok.rename_column("label", "labels")
mix_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=mix_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on mix Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 2864/2864 [00:00<00:00, 4212.05 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on mix Dataset ===
loss        : 0.6836
accuracy    : 0.5660
precision   : 0.5541
recall      : 0.6760
specificity : 0.4560
fpr         : 0.5440
f1          : 0.6090
mcc         : 0.1353
kappa       : 0.1320
mse         : 0.4340
mae         : 0.4340
auc         : 0.5942
runtime     : 26.8396
samples_per_second: 106.7080
steps_per_second: 13.3390


cvefixes fol

In [26]:
import pandas as pd

fun_test_df = pd.read_json("test_512.json")
fun_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4216 entries, 0 to 4215
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  4216 non-null   object
 1   input        4216 non-null   object
 2   output       4216 non-null   int64 
 3   idx          4216 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 131.9+ KB


In [27]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFOL_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
fun_test_df = fun_test_df.rename(columns={"input": "func", "output": "label"})
fun_test_df = fun_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
fun_test_ds = Dataset.from_pandas(fun_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

fun_test_tok = fun_test_ds.map(tokenize_fn, batched=True)
fun_test_tok = fun_test_tok.rename_column("label", "labels")
fun_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=fun_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on VULLM fol without Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 4216/4216 [00:01<00:00, 3772.70 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on VULLM fol without Dataset ===
loss        : 0.7170
accuracy    : 0.5019
precision   : 0.5014
recall      : 0.6722
specificity : 0.3316
fpr         : 0.6684
f1          : 0.5744
mcc         : 0.0040
kappa       : 0.0038
mse         : 0.4981
mae         : 0.4981
auc         : 0.5165
runtime     : 45.8489
samples_per_second: 91.9540
steps_per_second: 11.4940


without FOL devign

In [28]:
import numpy as np
import torch
import pandas as pd
import time
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== STEP 1: Data Preprocessing ======
train_df = train_df.rename(columns={
    "func_cleaned": "func",
    "target": "label",

})
test_df = test_df.rename(columns={
    "func_cleaned": "func",
    "target": "label"
})

train_df = train_df.dropna(subset=[ "func", "label"])
test_df = test_df.dropna(subset=["func", "label"])

train_df["label"] = train_df["label"].astype(int)
test_df["label"] = test_df["label"].astype(int)

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

# ====== STEP 2: Tokenizer Setup ======
model_name = "microsoft/unixcoder-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Train: use FOL + FUNC

def tokenize_fn_train(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )


# Test: use FUNC only
def tokenize_fn_test(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

train_tok = train_ds.map(tokenize_fn_train, batched=True)
test_tok  = test_ds.map(tokenize_fn_test, batched=True)

# ====== STEP 3: Prepare datasets ======
train_tok = train_tok.rename_column('label', 'labels')
test_tok  = test_tok.rename_column('label', 'labels')
train_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# ====== STEP 4: Load UnixCoder Model ======
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.config.problem_type = "single_label_classification"
model.config.id2label = {0: "safe", 1: "vuln"}
model.config.label2id = {"safe": 0, "vuln": 1}

# ====== STEP 5: Metrics ======
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }

# ====== STEP 6: Training Setup ======
data_collator = DataCollatorWithPadding(tokenizer)
args = TrainingArguments(
    output_dir='./unixcoder_trainFOL_testFUNC',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

# ====== STEP 7: Train ======
start_train = time.time()
trainer.train()
end_train = time.time()
train_runtime = end_train - start_train
print(f"\n=== Training completed in {train_runtime:.2f} seconds ({train_runtime / 60:.2f} minutes) ===")

# ====== STEP 8: Evaluate ======
start_eval = time.time()
metrics = trainer.evaluate()
end_eval = time.time()
eval_runtime = end_eval - start_eval
print(f"\n=== Evaluation completed in {eval_runtime:.2f} seconds ({eval_runtime / 60:.2f} minutes) ===")

print("\n=== Hold‐out Metrics (Trained with FOL + FUNC, Tested with FUNC only) ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")

print(f"Train Time (s): {train_runtime:.2f}")
print(f"Eval Time (s):  {eval_runtime:.2f}")

# ====== STEP 9: Save Model ======
save_dir = './unixcoder_trainFUNC_testFUNC_saved'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\nModel and tokenizer saved to {save_dir}")


Map: 100%|██████████| 7952/7952 [00:07<00:00, 1041.78 examples/s]
Map: 100%|██████████| 2807/2807 [00:03<00:00, 762.68 examples/s] 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Specificity,Fpr,F1,Mcc,Kappa,Mse,Mae,Auc
1,0.6426,0.670381,0.582472,0.518182,0.515625,0.633564,0.366436,0.5169,0.149277,0.149276,0.417528,0.417528,0.602325
2,0.6467,0.660021,0.586391,0.514543,0.800164,0.423004,0.576996,0.626328,0.235853,0.209466,0.413609,0.413609,0.680063
3,0.5126,0.694906,0.631279,0.571542,0.594572,0.659334,0.340666,0.58283,0.252864,0.252706,0.368721,0.368721,0.69137


Checkpoint destination directory ./unixcoder_trainFOL_testFUNC\checkpoint-994 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./unixcoder_trainFOL_testFUNC\checkpoint-1988 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./unixcoder_trainFOL_testFUNC\checkpoint-2982 already exists and is non-empty. Saving will proceed but saved results may be invalid.



=== Training completed in 1100.78 seconds (18.35 minutes) ===



=== Evaluation completed in 28.84 seconds (0.48 minutes) ===

=== Hold‐out Metrics (Trained with FOL + FUNC, Tested with FUNC only) ===
loss        : 0.6600
accuracy    : 0.5864
precision   : 0.5145
recall      : 0.8002
specificity : 0.4230
fpr         : 0.5770
f1          : 0.6263
mcc         : 0.2359
kappa       : 0.2095
mse         : 0.4136
mae         : 0.4136
auc         : 0.6801
runtime     : 28.8316
samples_per_second: 97.3580
steps_per_second: 12.1740
Train Time (s): 1100.78
Eval Time (s):  28.84

Model and tokenizer saved to ./unixcoder_trainFUNC_testFUNC_saved


bigvul without

In [29]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFUNC_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
gbig_test_df = gbig_test_df.rename(columns={"input": "func", "output": "label"})
gbig_test_df = gbig_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
gbig_test_ds = Dataset.from_pandas(gbig_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

big_test_tok = gbig_test_ds.map(tokenize_fn, batched=True)
big_test_tok = big_test_tok.rename_column("label", "labels")
big_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=big_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on BigVul Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 1170/1170 [00:00<00:00, 3857.19 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on BigVul Dataset ===
loss        : 0.8055
accuracy    : 0.4744
precision   : 0.4851
recall      : 0.8359
specificity : 0.1128
fpr         : 0.8872
f1          : 0.6139
mcc         : -0.0742
kappa       : -0.0513
mse         : 0.5256
mae         : 0.5256
auc         : 0.3891
runtime     : 11.5845
samples_per_second: 100.9970
steps_per_second: 12.6890


diverse vul without

In [30]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFUNC_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
gdiv_test_df = gdiv_test_df.rename(columns={"code_snip": "func", "output": "label"})
gdiv_test_df= gdiv_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
gdiv_test_ds = Dataset.from_pandas(gdiv_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

div_test_tok = gdiv_test_ds.map(tokenize_fn, batched=True)
div_test_tok = div_test_tok.rename_column("label", "labels")
div_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=div_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on Diverse Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 1532/1532 [00:00<00:00, 1952.40 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Diverse Dataset ===
loss        : 0.7949
accuracy    : 0.5052
precision   : 0.5028
recall      : 0.9465
specificity : 0.0640
fpr         : 0.9360
f1          : 0.6567
mcc         : 0.0222
kappa       : 0.0104
mse         : 0.4948
mae         : 0.4948
auc         : 0.5041
runtime     : 20.0009
samples_per_second: 76.5970
steps_per_second: 9.6000


juliet wthout fol

In [31]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFUNC_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
juliet_test_df = juliet_test_df.rename(columns={"code_snip": "func", "output": "label"})
juliet_test_df = juliet_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
juliet_test_ds = Dataset.from_pandas(juliet_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

juliet_test_tok = juliet_test_ds.map(tokenize_fn, batched=True)
juliet_test_tok = juliet_test_tok.rename_column("label", "labels")
juliet_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=juliet_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on Juliet Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 3152/3152 [00:01<00:00, 3018.58 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Juliet Dataset ===
loss        : 0.6951
accuracy    : 0.5187
precision   : 0.5097
recall      : 0.9854
specificity : 0.0520
fpr         : 0.9480
f1          : 0.6719
mcc         : 0.1043
kappa       : 0.0374
mse         : 0.4813
mae         : 0.4813
auc         : 0.6181
runtime     : 31.3627
samples_per_second: 100.5020
steps_per_second: 12.5630


RVL without

In [32]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFUNC_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame
reveal_test_df = pd.read_csv("Reveal_vultest.csv")

# Standardize column names and drop missing values
reveal_test_df = reveal_test_df.rename(columns={"input": "func", "output": "label"})
reveal_test_df = reveal_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
reveal_test_ds = Dataset.from_pandas(reveal_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

reveal_test_tok = reveal_test_ds.map(tokenize_fn, batched=True)
reveal_test_tok = reveal_test_tok.rename_column("label", "labels")
reveal_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=reveal_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on Reveal Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 2028/2028 [00:00<00:00, 3146.40 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Reveal Dataset ===
loss        : 0.8107
accuracy    : 0.4882
precision   : 0.4936
recall      : 0.9112
specificity : 0.0651
fpr         : 0.9349
f1          : 0.6403
mcc         : -0.0444
kappa       : -0.0237
mse         : 0.5118
mae         : 0.5118
auc         : 0.4366
runtime     : 20.1701
samples_per_second: 100.5450
steps_per_second: 12.5930


mixvul without

In [33]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFUNC_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
mix_test_df = mix_test_df.rename(columns={"input": "func", "output": "label"})
mix_test_df = mix_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
mix_test_ds = Dataset.from_pandas(mix_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

mix_test_tok = mix_test_ds.map(tokenize_fn, batched=True)
mix_test_tok = mix_test_tok.rename_column("label", "labels")
mix_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=mix_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on mix withour Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 2864/2864 [00:00<00:00, 3037.51 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on mix withour Dataset ===
loss        : 0.7587
accuracy    : 0.5297
precision   : 0.5166
recall      : 0.9211
specificity : 0.1383
fpr         : 0.8617
f1          : 0.6620
mcc         : 0.0954
kappa       : 0.0594
mse         : 0.4703
mae         : 0.4703
auc         : 0.5494
runtime     : 28.1978
samples_per_second: 101.5680
steps_per_second: 12.6960


CVE fixes without

In [35]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = './unixcoder_trainFUNC_testFUNC_saved'
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame


# Standardize column names and drop missing values
fun_test_df = fun_test_df.rename(columns={"input": "func", "output": "label"})
fun_test_df = fun_test_df.dropna(subset=["func", "label"])


# Convert to HuggingFace Dataset
fun_test_ds = Dataset.from_pandas(fun_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]],
        truncation=True, padding='max_length', max_length=256
    )

fun_test_tok = fun_test_ds.map(tokenize_fn, batched=True)
fun_test_tok = fun_test_tok.rename_column("label", "labels")
fun_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=fun_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on VULLM without Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|██████████| 4216/4216 [00:01<00:00, 2957.51 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on VULLM without Dataset ===
loss        : 0.7981
accuracy    : 0.5021
precision   : 0.5011
recall      : 0.9554
specificity : 0.0489
fpr         : 0.9511
f1          : 0.6574
mcc         : 0.0101
kappa       : 0.0043
mse         : 0.4979
mae         : 0.4979
auc         : 0.5032
runtime     : 40.7854
samples_per_second: 103.3700
steps_per_second: 12.9210
