In [None]:
import pandas as pd
# Read the train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [None]:
## CodeT5 without FOL

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    matthews_corrcoef, cohen_kappa_score,
    mean_squared_error, mean_absolute_error,
    confusion_matrix, roc_auc_score
)
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# Map the raw `target` values to integer class ids. The encoder is fitted on
# the training split only and then applied to both splits.
le = LabelEncoder().fit(train_df["target"])
train_df["label"] = le.transform(train_df["target"])
test_df["label"] = le.transform(test_df["target"])
num_labels = len(le.classes_)

# Wrap the text column and the encoded label of each split in a Dataset.
train_ds = Dataset.from_pandas(train_df.loc[:, ["func_cleaned", "label"]])
test_ds = Dataset.from_pandas(test_df.loc[:, ["func_cleaned", "label"]])


# Load the pretrained tokenizer and a sequence-classification model with one
# output per encoded class.
model_name = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def preprocess(batch):
    """Tokenize the ``func_cleaned`` entries of a batch.

    Each sequence is truncated and padded to exactly 256 tokens. Returns the
    tokenizer output for the batch.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(batch["func_cleaned"], **encode_options)

# Tokenize both splits in batches.
train_ds, test_ds = (
    split.map(preprocess, batched=True) for split in (train_ds, test_ds)
)

# Expose only the model inputs and the label, as torch tensors.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

def compute_metrics(p):
    """Compute evaluation metrics from a ``Trainer`` prediction object.

    Args:
        p: Object exposing ``predictions`` (the logits array, or a tuple whose
            first element is the logits) and ``label_ids`` (integer class ids).

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1", "MCC",
        "Kappa", "MSE", "MAE", "Specificity", "Sensitivity", "FPR" and "AUC".
        Precision/recall/F1 are macro-averaged. Specificity, sensitivity and
        FPR are unweighted means of the per-class one-vs-rest rates. AUC is
        NaN when scikit-learn cannot compute it (for example when a class is
        absent from ``label_ids``).
    """
    logits = p.predictions
    # Encoder-decoder models can return extra tensors next to the logits; the
    # predictions then arrive as a tuple with the logits in first position.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(p.label_ids)
    preds = np.argmax(logits, axis=1)

    # Numerically stable softmax so AUC is computed on probabilities rather
    # than on raw logits.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(shifted)
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    mcc = matthews_corrcoef(labels, preds)
    kappa = cohen_kappa_score(labels, preds)
    mse = mean_squared_error(labels, preds)
    mae = mean_absolute_error(labels, preds)

    # Per-class one-vs-rest counts derived from the confusion matrix
    # (rows = true class, columns = predicted class).
    cm = confusion_matrix(labels, preds)
    tp = np.diag(cm)
    fn = cm.sum(axis=1) - tp
    fp = cm.sum(axis=0) - tp
    tn = cm.sum() - (tp + fn + fp)
    # The small epsilon avoids division by zero for classes with no support.
    spec = np.mean(tn / (tn + fp + 1e-12))
    sens = np.mean(tp / (tp + fn + 1e-12))
    fpr = np.mean(fp / (fp + tn + 1e-12))

    try:
        if num_labels == 2:
            # label_binarize yields a single column for two classes, which
            # does not match the two score columns; score the positive class
            # directly instead.
            auc = roc_auc_score(labels, probs[:, 1])
        else:
            lb = label_binarize(labels, classes=list(range(num_labels)))
            auc = roc_auc_score(lb, probs, average="macro", multi_class="ovr")
    except ValueError:
        # Raised by scikit-learn when AUC is undefined for the given labels.
        auc = float("nan")

    return {
        "Accuracy":    acc,
        "Precision":   prec,
        "Recall":      rec,
        "F1":          f1,
        "MCC":         mcc,
        "Kappa":       kappa,
        "MSE":         mse,
        "MAE":         mae,
        "Specificity": spec,
        "Sensitivity": sens,
        "FPR":         fpr,
        "AUC":         auc,
    }


# Per-epoch evaluation drives early stopping and best-checkpoint selection, so
# it must not see the test set. Hold out 10% of the training data for it and
# keep the test set for the single final evaluation below.
train_val_split = train_ds.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir               = "./codet5-results",
    eval_strategy            = "epoch",
    save_strategy            = "epoch",
    num_train_epochs         = 5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    learning_rate            = 2e-5,
    load_best_model_at_end   = True,
    metric_for_best_model    = "Accuracy",
)

trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_val_split["train"],
    eval_dataset    = train_val_split["test"],   # validation split, not the test set
    tokenizer       = tokenizer,
    compute_metrics = compute_metrics,
    # Stop as soon as one evaluation fails to improve the tracked metric.
    callbacks       = [EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()


# Score the untouched test set once, with the best checkpoint restored.
results = trainer.evaluate(eval_dataset=test_ds)
print("\nFinal evaluation on test set:")
for k, v in results.items():
    print(f"  {k}: {v:.4f}")


## CodeT5 with FOL

In [None]:
import pandas as pd
from datasets import Dataset
# Load the train/test CSV files that carry a `fol_logic` column.
train_df, test_df = pd.read_csv('train_fol.csv'), pd.read_csv('test_fol.csv')

for df in (train_df, test_df):
    # Force integer class ids.
    df['target'] = df['target'].astype(int)
    # Model input = FOL text (empty string when missing), the literal
    # separator, then the contents of `func_cleaned`.
    # NOTE(review): the "LOGIC:" marker ends up in front of the code, not in
    # front of the FOL text. Confirm this ordering is intended.
    df['model_input'] = df['fol_logic'].fillna('').add(" // LOGIC: ").add(df['func_cleaned'])

# HuggingFace Datasets with the two columns exposed as `text` / `label`.
train_ds = Dataset.from_pandas(
    train_df[['model_input', 'target']].set_axis(['text', 'label'], axis=1)
)
test_ds = Dataset.from_pandas(
    test_df[['model_input', 'target']].set_axis(['text', 'label'], axis=1)
)


In [None]:

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)

# Tokenizer matching the CodeT5 checkpoint loaded further down in this cell.
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')

def tokenize_fn(batch):
    """Tokenize ``batch['text']``, truncating and padding every sequence to 256 tokens."""
    encode_options = {'truncation': True, 'padding': 'max_length', 'max_length': 256}
    return tokenizer(batch['text'], **encode_options)

# Tokenize each split and rename the target column from `label` to `labels`.
train_tok = train_ds.map(tokenize_fn, batched=True).rename_column('label', 'labels')
test_tok = test_ds.map(tokenize_fn, batched=True).rename_column('label', 'labels')

# Expose only the model inputs and the target, as torch tensors.
train_tok.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tok.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Two-class sequence-classification model on top of the CodeT5 checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('Salesforce/codet5-base', num_labels=2)
model.config.problem_type = "single_label_classification"
# Human-readable names for the two class ids.
model.config.id2label = {0: "safe", 1: "vuln"}
model.config.label2id = {"safe": 0, "vuln": 1}

def compute_metrics(eval_pred):
    """Compute binary-classification metrics for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is the two-column
            logits array, or a tuple whose first element is that array;
            ``labels`` holds the 0/1 class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall", "specificity",
        "fpr", "f1", "mcc", "kappa", "mse", "mae" and "auc". "auc" is NaN when
        scikit-learn cannot compute it (for example only one class present).
    """
    logits, labels = eval_pred
    # Encoder-decoder models can return extra tensors next to the logits; the
    # predictions then arrive as a tuple with the logits in first position.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2, even when only one class
    # occurs in this evaluation pass; otherwise the unpacking would fail.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    # Probability of class 1; float32 keeps softmax usable if the logits were
    # produced in half precision.
    probs = torch.softmax(torch.tensor(logits, dtype=torch.float32), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn/(tn+fp) if (tn+fp)>0 else 0,
        "fpr":         fp/(fp+tn) if (fp+tn)>0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


data_collator = DataCollatorWithPadding(tokenizer)

# Per-epoch evaluation drives early stopping and best-checkpoint selection, so
# it must not see the hold-out set. Reserve 10% of the training data for it.
train_val_split = train_tok.train_test_split(test_size=0.1, seed=42)

args = TrainingArguments(
    output_dir='./codet5_fol',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_val_split['train'],
    eval_dataset=train_val_split['test'],   # validation split, not the hold-out set
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop as soon as one evaluation fails to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)


trainer.train()
# Score the untouched hold-out set once, with the best checkpoint restored.
metrics = trainer.evaluate(eval_dataset=test_tok)
print("\n=== Hold‐out Metrics ===")
for key, value in metrics.items():
    # Only the "eval_"-prefixed entries are printed, with the prefix removed.
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")


# Persist the fine-tuned model together with its tokenizer.
save_dir = './codet5_fol_saved'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\nModel and tokenizer saved to {save_dir}")


## SantaCoder with FOL

In [None]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)


# Load the train/test CSV files that carry a `fol_logic` column.
train_df, test_df = pd.read_csv('train_df.csv'), pd.read_csv('test_df.csv')

for df in (train_df, test_df):
    # Force integer class ids.
    df['target'] = df['target'].astype(int)
    # Model input = FOL text (empty string when missing), the literal
    # separator, then the contents of `func_cleaned`.
    # NOTE(review): the "LOGIC:" marker ends up in front of the code, not in
    # front of the FOL text. Confirm this ordering is intended.
    df['model_input'] = df['fol_logic'].fillna('').add(" // LOGIC: ").add(df['func_cleaned'])

# Datasets with the two columns exposed as `text` / `label`.
train_ds = Dataset.from_pandas(
    train_df[['model_input', 'target']].set_axis(['text', 'label'], axis=1)
)
test_ds = Dataset.from_pandas(
    test_df[['model_input', 'target']].set_axis(['text', 'label'], axis=1)
)


tokenizer = AutoTokenizer.from_pretrained('bigcode/santacoder')
# `padding='max_length'` below requires a pad token. If this tokenizer does
# not define one, fall back to its end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(batch):
    """Tokenize ``batch['text']``, truncating and padding every sequence to 256 tokens."""
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)

train_tok = train_ds.map(tokenize_fn, batched=True)
test_tok  = test_ds.map(tokenize_fn, batched=True)

# Rename the target column to `labels` and expose only the model inputs and
# the target, as torch tensors.
train_tok = train_tok.rename_column('label', 'labels')
test_tok  = test_tok.rename_column('label', 'labels')
train_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


model = AutoModelForSequenceClassification.from_pretrained(
    'bigcode/santacoder',
    num_labels=2,
    trust_remote_code=True  # permits running model code shipped with the checkpoint
)
model.config.problem_type = "single_label_classification"
model.config.id2label = {0: "safe", 1: "vuln"}
model.config.label2id = {"safe": 0, "vuln": 1}
# Decoder-only classification heads pool the last non-padding token and need
# the pad id in the config to locate it when a batch holds several sequences.
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id


def compute_metrics(eval_pred):
    """Compute binary-classification metrics for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is the two-column
            logits array, or a tuple whose first element is that array;
            ``labels`` holds the 0/1 class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall", "specificity",
        "fpr", "f1", "mcc", "kappa", "mse", "mae" and "auc". "auc" is NaN when
        scikit-learn cannot compute it (for example only one class present).
    """
    logits, labels = eval_pred
    # Some models return extra tensors next to the logits; the predictions
    # then arrive as a tuple with the logits in first position.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    # Pin the label set so the matrix is always 2x2, even when only one class
    # occurs in this evaluation pass; otherwise the unpacking would fail.
    tn, fp, _fn, _tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    # Probability of class 1; float32 keeps softmax usable when the logits
    # were produced in half precision (this run trains with fp16).
    probs = torch.softmax(torch.tensor(logits, dtype=torch.float32), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn/(tn+fp) if (tn+fp)>0 else 0,
        "fpr":         fp/(fp+tn) if (fp+tn)>0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


data_collator = DataCollatorWithPadding(tokenizer)

# Per-epoch evaluation drives early stopping and best-checkpoint selection, so
# it must not see the hold-out set. Reserve 10% of the training data for it.
train_val_split = train_tok.train_test_split(test_size=0.1, seed=42)

args = TrainingArguments(
    output_dir='./santacoder_fol',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_steps=50,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_val_split['train'],
    eval_dataset=train_val_split['test'],   # validation split, not the hold-out set
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop as soon as one evaluation fails to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)


trainer.train()
# Score the untouched hold-out set once, with the best checkpoint restored.
metrics = trainer.evaluate(eval_dataset=test_tok)
print("\n=== Hold‐out Metrics ===")
for key, value in metrics.items():
    # Only the "eval_"-prefixed entries are printed, with the prefix removed.
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")

# Persist the fine-tuned model together with its tokenizer.
save_dir = './santacoder_fol_saved'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\nModel and tokenizer saved to {save_dir}")


## SantaCoder without FOL

In [None]:
import pandas as pd
# Read the train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    matthews_corrcoef, cohen_kappa_score,
    mean_squared_error, mean_absolute_error,
    confusion_matrix, roc_auc_score
)
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# Map the raw `target` values to integer class ids. The encoder is fitted on
# the training split only and then applied to both splits.
le = LabelEncoder().fit(train_df["target"])
train_df["label"] = le.transform(train_df["target"])
test_df["label"] = le.transform(test_df["target"])
num_labels = len(le.classes_)


# Wrap the text column and the encoded label of each split in a Dataset.
train_ds = Dataset.from_pandas(train_df.loc[:, ["func_cleaned", "label"]])
test_ds = Dataset.from_pandas(test_df.loc[:, ["func_cleaned", "label"]])


model_name = "bigcode/santacoder"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
# Padding to a fixed length later in this cell requires a pad token. If this
# tokenizer does not define one, fall back to its end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model      = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    trust_remote_code=True  # permits running model code shipped with the checkpoint
)
# Decoder-only classification heads pool the last non-padding token and need
# the pad id in the config to locate it when a batch holds several sequences.
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id


def preprocess(batch):
    """Tokenize the ``func_cleaned`` entries of a batch.

    Each sequence is truncated and padded to exactly 256 tokens. Returns the
    tokenizer output for the batch.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(batch["func_cleaned"], **encode_options)

# Tokenize both splits in batches.
train_ds, test_ds = (
    split.map(preprocess, batched=True) for split in (train_ds, test_ds)
)
# Expose only the model inputs and the label, as torch tensors.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


def compute_metrics(p):
    """Compute evaluation metrics from a ``Trainer`` prediction object.

    Args:
        p: Object exposing ``predictions`` (the logits array, or a tuple whose
            first element is the logits) and ``label_ids`` (integer class ids).

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1", "MCC",
        "Kappa", "MSE", "MAE", "Specificity", "Sensitivity", "FPR" and "AUC".
        Precision/recall/F1 are macro-averaged. Specificity, sensitivity and
        FPR are unweighted means of the per-class one-vs-rest rates. AUC is
        NaN when scikit-learn cannot compute it (for example when a class is
        absent from ``label_ids``).
    """
    logits = p.predictions
    # Some models return extra tensors next to the logits; the predictions
    # then arrive as a tuple with the logits in first position.
    if isinstance(logits, tuple):
        logits = logits[0]
    # float64 also protects the softmax below when logits come in half
    # precision (this run trains with fp16).
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(p.label_ids)
    preds = np.argmax(logits, axis=1)

    # Numerically stable softmax so AUC is computed on probabilities rather
    # than on raw logits.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(shifted)
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    mcc = matthews_corrcoef(labels, preds)
    kappa = cohen_kappa_score(labels, preds)
    mse = mean_squared_error(labels, preds)
    mae = mean_absolute_error(labels, preds)

    # Per-class one-vs-rest counts derived from the confusion matrix
    # (rows = true class, columns = predicted class).
    cm = confusion_matrix(labels, preds)
    tp = np.diag(cm)
    fn = cm.sum(axis=1) - tp
    fp = cm.sum(axis=0) - tp
    tn = cm.sum() - (tp + fn + fp)
    # The small epsilon avoids division by zero for classes with no support.
    spec = np.mean(tn / (tn + fp + 1e-12))
    sens = np.mean(tp / (tp + fn + 1e-12))
    fpr = np.mean(fp / (fp + tn + 1e-12))

    try:
        if num_labels == 2:
            # label_binarize yields a single column for two classes, which
            # does not match the two score columns; score the positive class
            # directly instead.
            auc = roc_auc_score(labels, probs[:, 1])
        else:
            lb = label_binarize(labels, classes=list(range(num_labels)))
            auc = roc_auc_score(lb, probs, average="macro", multi_class="ovr")
    except ValueError:
        # Raised by scikit-learn when AUC is undefined for the given labels.
        auc = float("nan")

    return {
        "Accuracy":    acc,
        "Precision":   prec,
        "Recall":      rec,
        "F1":          f1,
        "MCC":         mcc,
        "Kappa":       kappa,
        "MSE":         mse,
        "MAE":         mae,
        "Specificity": spec,
        "Sensitivity": sens,
        "FPR":         fpr,
        "AUC":         auc,
    }

# 6. Training setup
# Per-epoch evaluation drives early stopping and best-checkpoint selection, so
# it must not see the test set. Hold out 10% of the training data for it and
# keep the test set for the single final evaluation below.
train_val_split = train_ds.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./santacoder-results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="Accuracy",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_split["train"],
    eval_dataset=train_val_split["test"],   # validation split, not the test set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop as soon as one evaluation fails to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)


trainer.train()


# Score the untouched test set once, with the best checkpoint restored.
results = trainer.evaluate(eval_dataset=test_ds)
print("\nFinal evaluation on test set:")
for k, v in results.items():
    # Skip entries that are not floats so the format spec cannot fail.
    if isinstance(v, float):
        print(f"  {k}: {v:.4f}")
