In [1]:
import pandas as pd

train_df = pd.read_csv("cdl_train.csv")
test_df  = pd.read_csv("cdl_test.csv")

In [2]:
from transformers import AutoTokenizer
from datasets import Dataset
model_name = "microsoft/unixcoder-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
# Ensure label is integer type
train_df['label'] = train_df['label'].astype(int)
test_df['label'] = test_df['label'].astype(int)

# HuggingFace datasets
hf_train = Dataset.from_pandas(train_df[['input', 'label']])
hf_test = Dataset.from_pandas(test_df[['input', 'label']])
tokenized_train = hf_train.map(tokenize_function, batched=True)
tokenized_test = hf_test.map(tokenize_function, batched=True)

tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])



Map:   0%|          | 0/11224 [00:00<?, ? examples/s]

Map:   0%|          | 0/2807 [00:00<?, ? examples/s]

In [3]:
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef,
    cohen_kappa_score, mean_squared_error,
    mean_absolute_error, roc_auc_score
)
import torch

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float('nan')
    return {
        "accuracy":    accuracy_score(labels, preds),
        "precision":   precision_score(labels, preds, zero_division=0),
        "recall":      recall_score(labels, preds, zero_division=0),
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "fpr":         fp / (fp + tn) if (fp + tn) > 0 else 0,
        "f1":          f1_score(labels, preds, zero_division=0),
        "mcc":         matthews_corrcoef(labels, preds),
        "kappa":       cohen_kappa_score(labels, preds),
        "mse":         mean_squared_error(labels, preds),
        "mae":         mean_absolute_error(labels, preds),
        "auc":         auc
    }


In [4]:
from transformers import TrainingArguments

training_args = TrainingArguments(
     output_dir="./cdl_grapghcodebert_model",
    eval_strategy="epoch",            # <-- fix typo!
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,          # Increase if enough VRAM
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=20,
    report_to="none",                       # avoid WandB complaints
    fp16=True,                              # enable mixed precision (if on Ampere or newer GPU)
    gradient_accumulation_steps=2,          # effective batch size up
    warmup_ratio=0.1,                       # or warmup_steps=500
    save_total_limit=2,                     # save only 2 best checkpoints
    seed=42,                   # avoid WandB complaints if not using it
)

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Specificity,Fpr,F1,Mcc,Kappa,Mse,Mae,Auc
1,0.6446,0.657155,0.572497,0.5,0.7125,0.467953,0.532047,0.587629,0.182962,0.171237,0.427503,0.427503,0.65648
2,0.5763,0.588905,0.661204,0.614325,0.5575,0.738643,0.261357,0.584535,0.30066,0.299653,0.338796,0.338796,0.725222
3,0.4894,0.595145,0.665123,0.594891,0.679167,0.654636,0.345364,0.634241,0.330369,0.32792,0.334877,0.334877,0.750904
4,0.3774,0.694629,0.684717,0.651297,0.565,0.774113,0.225887,0.605087,0.347314,0.3449,0.315283,0.315283,0.761094
5,0.326,0.787721,0.677592,0.630185,0.595,0.739266,0.260734,0.612087,0.337053,0.336646,0.322408,0.322408,0.757983


TrainOutput(global_step=3510, training_loss=0.5001279069487525, metrics={'train_runtime': 591.6966, 'train_samples_per_second': 94.846, 'train_steps_per_second': 5.932, 'total_flos': 7382896213401600.0, 'train_loss': 0.5001279069487525, 'epoch': 5.0})

In [6]:
save_dir = "./cdl_unixcoder_model_final"
trainer.save_model(save_dir)  # saves model + config
tokenizer.save_pretrained(save_dir)  # saves tokenizer files

('./cdl_unixcoder_model_final\\tokenizer_config.json',
 './cdl_unixcoder_model_final\\special_tokens_map.json',
 './cdl_unixcoder_model_final\\vocab.json',
 './cdl_unixcoder_model_final\\merges.txt',
 './cdl_unixcoder_model_final\\added_tokens.json',
 './cdl_unixcoder_model_final\\tokenizer.json')

In [7]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, matthews_corrcoef, cohen_kappa_score,
    mean_squared_error, mean_absolute_error, roc_auc_score
)

# --- Load Model & Tokenizer ---
model_dir = "./cdl_unixcoder_model_final"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# --- Load Test Data ---
test_df = pd.read_csv("cdl_test.csv")
funcs = test_df['func'].astype(str).tolist()
true_labels = test_df['label'].astype(int).tolist()

# --- Batch Prediction ---
def batch_predict_probs(funcs, model, tokenizer, batch_size=16, max_length=256):
    all_preds = []
    all_probs = []
    for i in range(0, len(funcs), batch_size):
        batch = funcs[i:i+batch_size]
        encodings = tokenizer(batch, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")
        input_ids = encodings["input_ids"].to(device)
        attention_mask = encodings["attention_mask"].to(device)
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            preds = np.argmax(probs, axis=1)
            all_probs.extend(probs[:, 1])
            all_preds.extend(preds)
    return np.array(all_preds), np.array(all_probs)

preds, probs = batch_predict_probs(funcs, model, tokenizer)

try:
    tn, fp, fn, tp = confusion_matrix(true_labels, preds).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
except Exception:
    specificity, fpr = float('nan'), float('nan')

try:
    auc = roc_auc_score(true_labels, probs)
except Exception:
    auc = float('nan')

print("\n" + "="*28 + " Evaluation Metrics " + "="*28)
print(f"{'Accuracy':<15}: {accuracy_score(true_labels, preds):.4f}")
print(f"{'MCC':<15}: {matthews_corrcoef(true_labels, preds):.4f}")
print(f"{'Kappa':<15}: {cohen_kappa_score(true_labels, preds):.4f}")
print(f"{'Precision':<15}: {precision_score(true_labels, preds, zero_division=0):.4f}")
print(f"{'Recall':<15}: {recall_score(true_labels, preds, zero_division=0):.4f}")
print(f"{'F1 Score':<15}: {f1_score(true_labels, preds, zero_division=0):.4f}")
print(f"{'Specificity':<15}: {specificity:.4f}")
print(f"{'FPR':<15}: {fpr:.4f}")
print(f"{'AUC Score':<15}: {auc:.4f}")
print(f"{'MAE':<15}: {mean_absolute_error(true_labels, preds):.4f}")
print(f"{'MSE':<15}: {mean_squared_error(true_labels, preds):.4f}")
print("="*70)



Accuracy       : 0.6608
MCC            : 0.3073
Kappa          : 0.3073
Precision      : 0.6032
Recall         : 0.6042
F1 Score       : 0.6037
Specificity    : 0.7032
FPR            : 0.2968
AUC Score      : 0.7349
MAE            : 0.3392
MSE            : 0.3392
