In [4]:
import pandas as pd

In [5]:
# Read the test CSV once and print its schema (columns, dtypes, non-null counts).
# NOTE(review): in the recorded output `repo` is non-null for 1262 rows, the
# same count as label == 1 in the following cell - confirm whether the two
# classes come from different sources.
NASA_TEST_CSV = "nasa_test.csv"
cve_test_df = pd.read_csv(NASA_TEST_CSV)
cve_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   repo    1262 non-null   object
 1   func    2437 non-null   object
 2   label   2437 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 57.2+ KB


In [8]:
# Class balance of the test split: number of rows per value of `label`.
label_counts = cve_test_df["label"].value_counts()
print(label_counts)


label
1    1262
0    1175
Name: count, dtype: int64


In [11]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== Load Test Data ======
# The CSV is re-read here so the cell does not rely on frames built earlier.
df = pd.read_csv("nasa_test.csv")

# Keep only rows whose label is 0 or 1 and whose `func` / `label` are present.
has_binary_label = df["label"].isin([0, 1])
df = df[has_binary_label].dropna(subset=["func", "label"])

# ====== Load Model ======
model_path = "./codebert_trainFUNC_testFUNC_saved"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# The tokenizer was not saved next to the fine-tuned weights, so it is loaded
# from the base checkpoint instead.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
data_collator = DataCollatorWithPadding(tokenizer)


# ====== Preprocess Test Set ======
def _encode_batch(batch):
    """Tokenize a batch of `func` strings, each prefixed with "FUNC: ".

    Every sequence is truncated / padded to exactly 256 tokens.
    """
    prefixed = ["FUNC: " + source for source in batch["func"]]
    return tokenizer(prefixed, truncation=True, padding='max_length', max_length=256)


test_ds = (
    Dataset.from_pandas(df)
    .map(_encode_batch, batched=True)
    .rename_column("label", "labels")  # the target column is exposed as `labels`
)
# Expose only the tensors listed here, as torch tensors.
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# ====== Define Metrics ======
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A pair that unpacks as ``(logits, labels)``. ``logits`` is
            arg-maxed over its last axis and column 1 of its softmax is used
            as the positive-class score, so it must be 2-D with at least two
            columns.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``specificity``,
        ``fpr``, ``f1``, ``mcc``, ``kappa``, ``mse``, ``mae`` and ``auc``.
        ``specificity`` and ``fpr`` are 0 when there are no negative samples;
        ``auc`` is NaN when ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Pin the label set so the matrix is always 2x2. Without `labels=[0, 1]`
    # a run in which only one class occurs produces a 1x1 matrix, the 4-way
    # unpack fails, and specificity/FPR would silently be reported as 0.
    try:
        tn, fp, _, _ = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    except ValueError:
        # Best-effort fallback, e.g. when neither 0 nor 1 appears in `labels`.
        tn = fp = 0

    # Softmax probability of class 1, used as the ranking score for ROC AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        # AUC could not be computed for these labels; report NaN instead of failing.
        auc = float('nan')

    negatives = tn + fp  # number of samples whose true label is 0

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr": fp / negatives if negatives > 0 else 0,
        "f1": f1_score(labels, preds, zero_division=0),
        "mcc": matthews_corrcoef(labels, preds),
        "kappa": cohen_kappa_score(labels, preds),
        # Computed on hard 0/1 predictions, not on probabilities.
        "mse": mean_squared_error(labels, preds),
        "mae": mean_absolute_error(labels, preds),
        "auc": auc
    }

# ====== Run Evaluation ======
# No TrainingArguments are passed, so the Trainer uses its library defaults.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=test_ds)

# ====== Print Metrics ======
# The header names the checkpoint and the CSV that was actually evaluated; it
# previously said "Devign" although this cell loads nasa_test.csv, and it was
# identical for every model, which made the recorded results indistinguishable.
# NOTE(review): the recorded run reports AUC far below 0.5 and specificity 0.0,
# i.e. predictions anti-correlated with the labels - confirm that the label
# polarity in nasa_test.csv matches the data the checkpoint was trained on.
print(f"\n=== Evaluation of {model_path} on NASA Test Dataset (nasa_test.csv) ===")
for k, v in metrics.items():
    if k.startswith("eval_"):
        # Strip the "eval_" prefix that the Trainer adds to every metric key.
        print(f"{k[len('eval_'):]:<12}: {v:.4f}")


Map: 100%|██████████| 2437/2437 [00:00<00:00, 5325.46 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Devign Test Dataset ===
loss        : 2.9406
accuracy    : 0.2556
precision   : 0.3465
recall      : 0.4937
specificity : 0.0000
fpr         : 1.0000
f1          : 0.4072
mcc         : -0.5752
kappa       : -0.5144
mse         : 0.7444
mae         : 0.7444
auc         : 0.0799
runtime     : 16.1377
samples_per_second: 151.0130
steps_per_second: 18.9000


In [12]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== Load Test Data ======
# The CSV is re-read here so the cell does not rely on frames built earlier.
df = pd.read_csv("nasa_test.csv")

# Keep only rows whose label is 0 or 1 and whose `func` / `label` are present.
has_binary_label = df["label"].isin([0, 1])
df = df[has_binary_label].dropna(subset=["func", "label"])

# ====== Load Model ======
model_path = "./graphcodebert_trainFUNC_testFUNC_saved"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# The tokenizer was not saved next to the fine-tuned weights, so it is loaded
# from the base checkpoint instead.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
data_collator = DataCollatorWithPadding(tokenizer)


# ====== Preprocess Test Set ======
def _encode_batch(batch):
    """Tokenize a batch of `func` strings, each prefixed with "FUNC: ".

    Every sequence is truncated / padded to exactly 256 tokens.
    """
    prefixed = ["FUNC: " + source for source in batch["func"]]
    return tokenizer(prefixed, truncation=True, padding='max_length', max_length=256)


test_ds = (
    Dataset.from_pandas(df)
    .map(_encode_batch, batched=True)
    .rename_column("label", "labels")  # the target column is exposed as `labels`
)
# Expose only the tensors listed here, as torch tensors.
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# ====== Define Metrics ======
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A pair that unpacks as ``(logits, labels)``. ``logits`` is
            arg-maxed over its last axis and column 1 of its softmax is used
            as the positive-class score, so it must be 2-D with at least two
            columns.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``specificity``,
        ``fpr``, ``f1``, ``mcc``, ``kappa``, ``mse``, ``mae`` and ``auc``.
        ``specificity`` and ``fpr`` are 0 when there are no negative samples;
        ``auc`` is NaN when ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Pin the label set so the matrix is always 2x2. Without `labels=[0, 1]`
    # a run in which only one class occurs produces a 1x1 matrix, the 4-way
    # unpack fails, and specificity/FPR would silently be reported as 0.
    try:
        tn, fp, _, _ = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    except ValueError:
        # Best-effort fallback, e.g. when neither 0 nor 1 appears in `labels`.
        tn = fp = 0

    # Softmax probability of class 1, used as the ranking score for ROC AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        # AUC could not be computed for these labels; report NaN instead of failing.
        auc = float('nan')

    negatives = tn + fp  # number of samples whose true label is 0

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr": fp / negatives if negatives > 0 else 0,
        "f1": f1_score(labels, preds, zero_division=0),
        "mcc": matthews_corrcoef(labels, preds),
        "kappa": cohen_kappa_score(labels, preds),
        # Computed on hard 0/1 predictions, not on probabilities.
        "mse": mean_squared_error(labels, preds),
        "mae": mean_absolute_error(labels, preds),
        "auc": auc
    }

# ====== Run Evaluation ======
# No TrainingArguments are passed, so the Trainer uses its library defaults.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=test_ds)

# ====== Print Metrics ======
# The header names the checkpoint and the CSV that was actually evaluated; it
# previously said "Devign" although this cell loads nasa_test.csv, and it was
# identical for every model, which made the recorded results indistinguishable.
# NOTE(review): the recorded run reports AUC far below 0.5 and specificity 0.0,
# i.e. predictions anti-correlated with the labels - confirm that the label
# polarity in nasa_test.csv matches the data the checkpoint was trained on.
print(f"\n=== Evaluation of {model_path} on NASA Test Dataset (nasa_test.csv) ===")
for k, v in metrics.items():
    if k.startswith("eval_"):
        # Strip the "eval_" prefix that the Trainer adds to every metric key.
        print(f"{k[len('eval_'):]:<12}: {v:.4f}")


Map: 100%|██████████| 2437/2437 [00:00<00:00, 3905.77 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Devign Test Dataset ===
loss        : 4.0614
accuracy    : 0.2302
precision   : 0.3232
recall      : 0.4445
specificity : 0.0000
fpr         : 1.0000
f1          : 0.3742
mcc         : -0.6132
kappa       : -0.5633
mse         : 0.7698
mae         : 0.7698
auc         : 0.0751
runtime     : 16.4403
samples_per_second: 148.2330
steps_per_second: 18.5520


In [13]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== Load Test Data ======
# The CSV is re-read here so the cell does not rely on frames built earlier.
df = pd.read_csv("nasa_test.csv")

# Keep only rows whose label is 0 or 1 and whose `func` / `label` are present.
has_binary_label = df["label"].isin([0, 1])
df = df[has_binary_label].dropna(subset=["func", "label"])

# ====== Load Model ======
model_path = "./unixcoder_trainFUNC_testFUNC_saved"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# The tokenizer was not saved next to the fine-tuned weights, so it is loaded
# from the base checkpoint instead.
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
data_collator = DataCollatorWithPadding(tokenizer)


# ====== Preprocess Test Set ======
def _encode_batch(batch):
    """Tokenize a batch of `func` strings, each prefixed with "FUNC: ".

    Every sequence is truncated / padded to exactly 256 tokens.
    """
    prefixed = ["FUNC: " + source for source in batch["func"]]
    return tokenizer(prefixed, truncation=True, padding='max_length', max_length=256)


test_ds = (
    Dataset.from_pandas(df)
    .map(_encode_batch, batched=True)
    .rename_column("label", "labels")  # the target column is exposed as `labels`
)
# Expose only the tensors listed here, as torch tensors.
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# ====== Define Metrics ======
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A pair that unpacks as ``(logits, labels)``. ``logits`` is
            arg-maxed over its last axis and column 1 of its softmax is used
            as the positive-class score, so it must be 2-D with at least two
            columns.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``specificity``,
        ``fpr``, ``f1``, ``mcc``, ``kappa``, ``mse``, ``mae`` and ``auc``.
        ``specificity`` and ``fpr`` are 0 when there are no negative samples;
        ``auc`` is NaN when ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Pin the label set so the matrix is always 2x2. Without `labels=[0, 1]`
    # a run in which only one class occurs produces a 1x1 matrix, the 4-way
    # unpack fails, and specificity/FPR would silently be reported as 0.
    try:
        tn, fp, _, _ = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    except ValueError:
        # Best-effort fallback, e.g. when neither 0 nor 1 appears in `labels`.
        tn = fp = 0

    # Softmax probability of class 1, used as the ranking score for ROC AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        # AUC could not be computed for these labels; report NaN instead of failing.
        auc = float('nan')

    negatives = tn + fp  # number of samples whose true label is 0

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "specificity": tn / negatives if negatives > 0 else 0,
        "fpr": fp / negatives if negatives > 0 else 0,
        "f1": f1_score(labels, preds, zero_division=0),
        "mcc": matthews_corrcoef(labels, preds),
        "kappa": cohen_kappa_score(labels, preds),
        # Computed on hard 0/1 predictions, not on probabilities.
        "mse": mean_squared_error(labels, preds),
        "mae": mean_absolute_error(labels, preds),
        "auc": auc
    }

# ====== Run Evaluation ======
# No TrainingArguments are passed, so the Trainer uses its library defaults.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=test_ds)

# ====== Print Metrics ======
# The header names the checkpoint and the CSV that was actually evaluated; it
# previously said "Devign" although this cell loads nasa_test.csv, and it was
# identical for every model, which made the recorded results indistinguishable.
# NOTE(review): the recorded run reports AUC far below 0.5 and specificity 0.0,
# i.e. predictions anti-correlated with the labels - confirm that the label
# polarity in nasa_test.csv matches the data the checkpoint was trained on.
print(f"\n=== Evaluation of {model_path} on NASA Test Dataset (nasa_test.csv) ===")
for k, v in metrics.items():
    if k.startswith("eval_"):
        # Strip the "eval_" prefix that the Trainer adds to every metric key.
        print(f"{k[len('eval_'):]:<12}: {v:.4f}")


Map: 100%|██████████| 2437/2437 [00:00<00:00, 5255.85 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



=== Evaluation on Devign Test Dataset ===
loss        : 0.9260
accuracy    : 0.4522
precision   : 0.4840
recall      : 0.8732
specificity : 0.0000
fpr         : 1.0000
f1          : 0.6228
mcc         : -0.2558
kappa       : -0.1307
mse         : 0.5478
mae         : 0.5478
auc         : 0.0607
runtime     : 16.0145
samples_per_second: 152.1740
steps_per_second: 19.0450
