In [1]:
import pandas as pd
test_df = pd.read_csv('/Users/akter/ns_main/dataset/nasa_test.csv')
train_df = pd.read_csv('/Users/akter/ns_main/dataset/nasa_train.csv')

In [2]:
import pandas as pd

# Load datasets
train_df = pd.read_csv('nasa_train.csv')
test_df = pd.read_csv('nasa_test.csv')

# Basic info: rows, columns, data types, nulls
print("=== Train Dataset Info ===")
print(train_df.info())
print("\n=== Test Dataset Info ===")
print(test_df.info())

# Column names
print("\nTrain Columns:", train_df.columns.tolist())
print("Test Columns:", test_df.columns.tolist())

# Check for missing values
print("\nMissing values in Train:\n", train_df.isnull().sum())
print("Missing values in Test:\n", test_df.isnull().sum())

# First few rows
print("\nTrain Sample Rows:\n", train_df.head())
print("\nTest Sample Rows:\n", test_df.head())

# Class distribution if 'label' column exists
if 'label' in train_df.columns:
    print("\nTrain Label Distribution:\n", train_df['label'].value_counts())
if 'label' in test_df.columns:
    print("\nTest Label Distribution:\n", test_df['label'].value_counts())

# Dataset shape
print(f"\nTrain Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")


=== Train Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9744 entries, 0 to 9743
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   repo    4919 non-null   object
 1   func    9744 non-null   object
 2   label   9744 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

=== Test Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   repo    1262 non-null   object
 1   func    2437 non-null   object
 2   label   2437 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 57.2+ KB
None

Train Columns: ['repo', 'func', 'label']
Test Columns: ['repo', 'func', 'label']

Missing values in Train:
 repo     4825
func        0
label       0
dtype: int64
Missing values in Test:
 repo     1175
func        0
label       0
dtype: int64

Train Sample Row

In [3]:
# Drop 'repo' column from both train and test datasets
train_df = train_df.drop(columns=['repo'])
test_df = test_df.drop(columns=['repo'])

In [4]:
print(train_df.columns)
print(test_df.columns)


Index(['func', 'label'], dtype='object')
Index(['func', 'label'], dtype='object')


In [5]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, matthews_corrcoef, cohen_kappa_score,
    mean_squared_error, mean_absolute_error
)

# === Load and prepare the data ===
train_df = pd.read_csv("nasa_train.csv")[["func", "label"]].dropna()
test_df = pd.read_csv("nasa_test.csv")[["func", "label"]].dropna()
train_ds = Dataset.from_pandas(train_df.rename(columns={"func": "text"}))
test_ds = Dataset.from_pandas(test_df.rename(columns={"func": "text"}))

# === Load tokenizer and model ===
model_name = "microsoft/unixcoder-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer)

# === Tokenization function ===
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# === Compute metrics ===
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()

    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         roc_auc_score(labels, preds)
    }

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="./graphcodebert_nasa_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Train the model ===
trainer.train()

# === Save the fine-tuned model and tokenizer ===
save_dir = "./unixcoder_nasa_finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# === Evaluate and print results ===
metrics = trainer.evaluate()
print("\n=== Evaluation Metrics ===")
for k, v in metrics.items():
    if isinstance(v, float):
        print(f"{k}: {v:.4f}")
    else:
        print(f"{k}: {v}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 7000/9744 [00:01<00:00, 6444.74 examples/s]Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9744/9744 [00:01<00:00, 5839.54 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2437/2437 [00:00<00:00, 6595.78 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environme

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Specificity,Fpr,F1,Mcc,Kappa,Mse,Mae,Auc
1,0.0046,5e-06,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
2,0.0,2e-06,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
3,0.0,1e-06,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1e-06,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
5,0.0,1e-06,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0



=== Evaluation Metrics ===
eval_loss: 0.0000
eval_accuracy: 1.0000
eval_precision: 1.0000
eval_recall: 1.0000
eval_specificity: 1.0000
eval_fpr: 0.0000
eval_f1: 1.0000
eval_mcc: 1.0000
eval_kappa: 1.0000
eval_mse: 0.0000
eval_mae: 0.0000
eval_auc: 1.0000
eval_runtime: 30.3669
eval_samples_per_second: 80.2520
eval_steps_per_second: 5.0380
epoch: 5.0000


In [None]:
import pandas as pd
CV_test_df= pd.read_csv('/Users/akter/ns_main/dataset/devign_test.csv')
CV_test_df.info()

In [6]:
import pandas as pd
CV_test_df = pd.read_csv("devign_test.csv")
CV_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   900 non-null    object
 1   cwe_id  900 non-null    object
 2   output  900 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.2+ KB


In [7]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = "./unixcoder_nasa_finetuned"
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame

CV_test_df= CV_test_df[CV_test_df["output"].isin([0, 1])]

# Standardize column names and drop missing values
CV_test_df = CV_test_df.rename(columns={"input": "func", "output": "label"})
CV_test_df = CV_test_df.dropna(subset=["func", "label"])



CV_test_ds = Dataset.from_pandas(CV_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]] ,
        truncation=True, padding='max_length', max_length=256
    )

CV_test_tok = CV_test_ds.map(tokenize_fn, batched=True)
CV_test_tok = CV_test_tok.rename_column("label", "labels")
CV_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=CV_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on RVL Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 600/600 [00:00<00:00, 4677.77 examples/s]
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



=== Evaluation on RVL Dataset ===
loss        : 5.7414
model_preparation_time: 0.0017
accuracy    : 0.5000
precision   : 0.5000
recall      : 1.0000
specificity : 0.0000
fpr         : 1.0000
f1          : 0.6667
mcc         : 0.0000
kappa       : 0.0000
mse         : 0.5000
mae         : 0.5000
auc         : 0.5078
runtime     : 4.0806
samples_per_second: 147.0380
steps_per_second: 18.3800


In [8]:
import pandas as pd
div_test_df = pd.read_csv("diverse_test.csv")
div_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532 entries, 0 to 1531
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code_snip  1532 non-null   object
 1   output     1532 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 24.1+ KB


In [9]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = "./unixcoder_nasa_finetuned"
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame

div_test_df= div_test_df[div_test_df["output"].isin([0, 1])]

# Standardize column names and drop missing values
div_test_df = div_test_df.rename(columns={"code_snip": "func", "output": "label"})
div_test_df = div_test_df.dropna(subset=["func", "label"])



div_test_ds = Dataset.from_pandas(div_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]] ,
        truncation=True, padding='max_length', max_length=256
    )

div_test_tok = div_test_ds.map(tokenize_fn, batched=True)
div_test_tok = div_test_tok.rename_column("label", "labels")
div_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=div_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on RVL Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1532/1532 [00:00<00:00, 6688.12 examples/s]
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



=== Evaluation on RVL Dataset ===
loss        : 5.7048
model_preparation_time: 0.0018
accuracy    : 0.5000
precision   : 0.5000
recall      : 1.0000
specificity : 0.0000
fpr         : 1.0000
f1          : 0.6667
mcc         : 0.0000
kappa       : 0.0000
mse         : 0.5000
mae         : 0.5000
auc         : 0.5892
runtime     : 9.6280
samples_per_second: 159.1190
steps_per_second: 19.9420


In [10]:
import pandas as pd
dj_test_df = pd.read_csv("djuliet_test.csv")
dj_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code_snip  3152 non-null   object
 1   output     3152 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 49.4+ KB


In [11]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = "./unixcoder_nasa_finetuned"
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame

dj_test_df= dj_test_df[dj_test_df["output"].isin([0, 1])]

# Standardize column names and drop missing values
dj_test_df = dj_test_df.rename(columns={"code_snip": "func", "output": "label"})
dj_test_df = dj_test_df.dropna(subset=["func", "label"])



dj_test_ds = Dataset.from_pandas(dj_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]] ,
        truncation=True, padding='max_length', max_length=256
    )

dj_test_tok = dj_test_ds.map(tokenize_fn, batched=True)
dj_test_tok = dj_test_tok.rename_column("label", "labels")
dj_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=dj_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on RVL Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3152/3152 [00:00<00:00, 10630.46 examples/s]
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



=== Evaluation on RVL Dataset ===
loss        : 5.8866
model_preparation_time: 0.0019
accuracy    : 0.5000
precision   : 0.5000
recall      : 1.0000
specificity : 0.0000
fpr         : 1.0000
f1          : 0.6667
mcc         : 0.0000
kappa       : 0.0000
mse         : 0.5000
mae         : 0.5000
auc         : 0.4782
runtime     : 20.3466
samples_per_second: 154.9150
steps_per_second: 19.3640


In [12]:
big_test_df = pd.read_csv("big_vultest.csv")
big_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170 entries, 0 to 1169
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  1170 non-null   object
 1   input        1170 non-null   object
 2   output       1170 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 27.5+ KB


In [13]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = "./unixcoder_nasa_finetuned"
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame

big_test_df= big_test_df[big_test_df["output"].isin([0, 1])]

# Standardize column names and drop missing values
big_test_df = big_test_df.rename(columns={"input": "func", "output": "label"})
big_test_df = big_test_df.dropna(subset=["func", "label"])



big_test_ds = Dataset.from_pandas(big_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]] ,
        truncation=True, padding='max_length', max_length=256
    )

big_test_tok = big_test_ds.map(tokenize_fn, batched=True)
big_test_tok = big_test_tok.rename_column("label", "labels")
big_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=big_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on RVL Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1170/1170 [00:00<00:00, 9158.48 examples/s]
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



=== Evaluation on RVL Dataset ===
loss        : 5.5969
model_preparation_time: 0.0019
accuracy    : 0.5000
precision   : 0.5000
recall      : 1.0000
specificity : 0.0000
fpr         : 1.0000
f1          : 0.6667
mcc         : 0.0000
kappa       : 0.0000
mse         : 0.5000
mae         : 0.5000
auc         : 0.6008
runtime     : 7.3537
samples_per_second: 159.1040
steps_per_second: 19.9900


In [14]:
cve_test_df = pd.read_json("test_512.json")
cve_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4216 entries, 0 to 4215
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  4216 non-null   object
 1   input        4216 non-null   object
 2   output       4216 non-null   int64 
 3   idx          4216 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 131.9+ KB


In [15]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# ====== LOAD SAVED MODEL ======
saved_dir = "./unixcoder_nasa_finetuned"
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
data_collator = DataCollatorWithPadding(tokenizer)

# ====== LOAD AND PROCESS NEW TEST DATA ======
# Replace with your actual new test DataFrame

cve_test_df= cve_test_df[cve_test_df["output"].isin([0, 1])]

# Standardize column names and drop missing values
cve_test_df = cve_test_df.rename(columns={"input": "func", "output": "label"})
cve_test_df = cve_test_df.dropna(subset=["func", "label"])



cve_test_ds = Dataset.from_pandas(cve_test_df)

# Tokenization (FUNC only, no FOL)
def tokenize_fn(batch):
    return tokenizer(
        ["FUNC: " + c for c in batch["func"]] ,
        truncation=True, padding='max_length', max_length=256
    )

cve_test_tok = cve_test_ds.map(tokenize_fn, batched=True)
cve_test_tok = cve_test_tok.rename_column("label", "labels")
cve_test_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

metrics = trainer.evaluate(eval_dataset=cve_test_tok)

# ====== PRINT METRICS ======
print("\n=== Evaluation on RVL Dataset ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4216/4216 [00:00<00:00, 9772.30 examples/s] 
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



=== Evaluation on RVL Dataset ===
loss        : 5.7907
model_preparation_time: 0.0019
accuracy    : 0.5000
precision   : 0.5000
recall      : 1.0000
specificity : 0.0000
fpr         : 1.0000
f1          : 0.6667
mcc         : 0.0000
kappa       : 0.0000
mse         : 0.5000
mae         : 0.5000
auc         : 0.5163
runtime     : 26.0298
samples_per_second: 161.9680
steps_per_second: 20.2460
