In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os
import ast
import numpy as np

# Show the working directory so the relative CSV paths below can be checked.
print(os.getcwd())
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../codebert/train_data.csv", "../codebert/test_data.csv")
)

# Preview each split: its first rows followed by its column names.
for heading, frame in (("Train DataFrame:", df_train), ("\nTest DataFrame:", df_test)):
    print(heading)
    print(frame.head())
    print("Columns:", frame.columns.tolist())

In [None]:
def parse_features_list(features_str):
    """Parse a serialized 1-D numeric vector into a float32 NumPy array.

    Accepts bracketed text whose numbers are separated by whitespace
    (possibly wrapped over several lines) and, additionally, text whose
    numbers are separated by commas, e.g. "[0.1 0.2 0.3]" or "[0.1, 0.2, 0.3]".

    Args:
        features_str: String holding the vector, optionally wrapped in
            square brackets.

    Returns:
        A 1-D ``np.ndarray`` of dtype ``float32`` with one entry per parsed
        number (empty when the string contains no numbers).

    Raises:
        ValueError: If a token cannot be converted to ``float``.
    """
    # The surrounding brackets carry no information. Commas are turned into
    # spaces so both serialisation styles tokenise the same way.
    body = features_str.strip().strip('[]').replace(',', ' ')
    # str.split() without arguments splits on any run of whitespace
    # (newlines included) and never yields empty tokens.
    return np.array([float(token) for token in body.split()], dtype=np.float32)

# Parse the serialized embedding column of each split and stack the per-row
# vectors into one float32 tensor per split.
train_vectors = df_train['features_list_codebert'].apply(parse_features_list)
test_vectors = df_test['features_list_codebert'].apply(parse_features_list)
X_train = torch.tensor(np.stack(train_vectors.values), dtype=torch.float32)
X_test = torch.tensor(np.stack(test_vectors.values), dtype=torch.float32)

# Labels become float column vectors: [N] -> [N, 1]
y_train = torch.tensor(df_train['vulnerability_exists'].values, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(df_test['vulnerability_exists'].values, dtype=torch.float32).unsqueeze(1)

# Sanity-check the shapes
for label, tensor in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{label} shape: {tensor.shape}")

# Persist the feature tensors (labels are not saved here).
output_dir = "../codebert/"
torch.save(X_train, "../codebert/X_train.pt")
torch.save(X_test, "../codebert/X_test.pt")

In [None]:
class VulnScreener(nn.Module):
    """Three-layer MLP that maps a feature vector to one probability.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout ->
    Linear -> Sigmoid. The final sigmoid means the output is already a
    probability in [0, 1], which is what ``nn.BCELoss`` expects.

    Args:
        input_dim: Size of each input feature vector (default 768).
        hidden_dim1: Width of the first hidden layer (default 256).
        hidden_dim2: Width of the second hidden layer (default 128).
        dropout: Dropout probability applied after each hidden activation
            (default 0.3).

    With the defaults the network is identical to the original fixed-size
    version, and the ``mlp.<index>.*`` state-dict keys are unchanged.
    """

    def __init__(self, input_dim=768, hidden_dim1=256, hidden_dim2=128, dropout=0.3):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),    # input -> hidden layer 1
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim1, hidden_dim2),  # hidden layer 1 -> hidden layer 2
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim2, 1),            # hidden layer 2 -> single output unit
            nn.Sigmoid(),                         # squash to a probability
        )

    def forward(self, x):
        """Return the sigmoid output of the MLP for ``x`` (last dim = input_dim)."""
        return self.mlp(x)

def compute_metrics(outputs, y_true, T):
    """Compute binary-classification metrics at decision threshold ``T``.

    Both tensors are flattened before comparison. Without this, an
    ``[N, 1]`` output compared against an ``[N]`` label tensor would silently
    broadcast to ``[N, N]`` and produce wrong counts.

    Args:
        outputs: Tensor of predicted probabilities.
        y_true: Tensor of ground-truth labels (1 = positive, 0 = negative)
            with the same number of elements as ``outputs``.
        T: Threshold; a sample is predicted positive when its output >= T.

    Returns:
        dict with float entries "accuracy", "precision", "recall", "f1",
        "fnr", "fpr", plus "T" (the threshold, passed through unchanged).

    Raises:
        ValueError: If ``outputs`` and ``y_true`` hold a different number of
            elements.
    """
    flat_outputs = outputs.reshape(-1)
    flat_labels = y_true.reshape(-1)
    if flat_outputs.numel() != flat_labels.numel():
        raise ValueError(
            f"outputs has {flat_outputs.numel()} elements but y_true has {flat_labels.numel()}"
        )

    predictions = (flat_outputs >= T).float()
    TP = torch.sum((predictions == 1) & (flat_labels == 1)).float()
    FN = torch.sum((predictions == 0) & (flat_labels == 1)).float()
    FP = torch.sum((predictions == 1) & (flat_labels == 0)).float()
    TN = torch.sum((predictions == 0) & (flat_labels == 0)).float()

    # Small constant keeps every ratio finite when a denominator would be 0
    # (e.g. no positive predictions, or no positive labels).
    epsilon = 1e-7
    accuracy = (TP + TN) / (TP + TN + FP + FN + epsilon)
    precision = TP / (TP + FP + epsilon)
    recall = TP / (TP + FN + epsilon)
    f1 = 2 * (precision * recall) / (precision + recall + epsilon)
    fnr = FN / (TP + FN + epsilon)
    fpr = FP / (FP + TN + epsilon)
    return {"accuracy": accuracy.item(), "precision": precision.item(), "recall": recall.item(),
            "f1": f1.item(), "fnr": fnr.item(), "fpr": fpr.item(), "T": T}

def train_vuln_screener(X_train, y_train, epochs=220):
    """Train a fresh VulnScreener and adapt its decision threshold.

    Each epoch is one full-batch step: the whole of ``X_train`` goes through
    the model, BCE loss against ``y_train`` is back-propagated, and Adam
    (lr=0.001) updates the weights.

    On every fifth epoch (0, 5, 10, ...) that step's outputs are scored with
    ``compute_metrics`` at the current threshold ``T`` (initially 0.5):
      * FNR > 0.1  -> lower T by 0.05, but not below 0.3;
      * otherwise FPR > 0.3 -> raise T by 0.05, but not above 0.7.
    A progress line is then printed. It shows the already-updated T next to
    the accuracy measured with the previous T.

    Args:
        X_train: Training feature tensor fed to the model.
        y_train: Training label tensor passed to ``nn.BCELoss``.
        epochs: Number of full-batch optimisation steps.

    Returns:
        Tuple ``(model, T)``: the trained model and the final threshold.
    """
    model = VulnScreener()
    loss_fn = nn.BCELoss()
    adam = optim.Adam(model.parameters(), lr=0.001)
    T = 0.5

    for epoch in range(epochs):
        # One full-batch gradient step.
        model.train()
        adam.zero_grad()
        probs = model(X_train)
        loss = loss_fn(probs, y_train)
        loss.backward()
        adam.step()

        if epoch % 5:
            continue  # report and adapt the threshold only every fifth epoch

        metrics = compute_metrics(probs, y_train, T)
        if metrics["fnr"] > 0.1:
            T = max(0.3, T - 0.05)
        elif metrics["fpr"] > 0.3:
            T = min(0.7, T + 0.05)
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}, T: {T:.2f}, Train Acc: {metrics['accuracy']:.4f}")

    return model, T

def test_vuln_screener(model, X_test, y_test, T):
    """Run ``model`` on the test features and print its metrics at threshold ``T``.

    The model is switched to eval mode and run under ``torch.no_grad()``.
    Every entry of the dict returned by ``compute_metrics`` is printed with
    four decimals.

    Args:
        model: Module to evaluate.
        X_test: Test feature tensor fed to the model.
        y_test: Test label tensor compared against the model outputs.
        T: Decision threshold forwarded to ``compute_metrics``.

    Returns:
        The model outputs for ``X_test`` (the metrics are only printed).
    """
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)

    report = compute_metrics(test_outputs, y_test, T)
    print("\nFinal testing result:")
    for name in report:
        print(f"{name}: {report[name]:.4f}")
    return test_outputs


In [None]:
# Training and testing
# Seed PyTorch's RNG first: weight initialisation and dropout are random, so
# without a fixed seed every Restart & Run All would produce a different
# model, threshold and set of metrics.
torch.manual_seed(42)
model, T = train_vuln_screener(X_train, y_train)
test_outputs = test_vuln_screener(model, X_test, y_test, T)

In [None]:
# Precision/recall trade-off of the test outputs across candidate thresholds.
precision, recall, thresholds = precision_recall_curve(y_test.numpy(), test_outputs.numpy())
fig, ax = plt.subplots()
ax.plot(recall, precision, label="Precision-Recall curve")
ax.set(xlabel="Recall", ylabel="Precision", title="Precision-Recall Curve")
ax.legend()
plt.show()

In [None]:
# Choose the best threshold: maximise precision while keeping recall reasonable.
# Candidates with recall < 0.9 are zeroed out so argmax can only pick one with
# recall >= 0.9. The final curve point is dropped ([:-1]) so the arrays line up
# with `thresholds`.
recall_ok = recall[:-1] >= 0.9
masked_precision = precision[:-1] * recall_ok
optimal_idx = np.argmax(masked_precision)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold:.4f}")
print(f"Optimal threshold: {optimal_threshold:.4f}")

In [None]:
# Re-evaluate the test outputs using the optimal threshold selected above.
metrics_optimal = compute_metrics(test_outputs, y_test, optimal_threshold)
print("\n最終測試結果 (最佳閾值):")
for metric_name in metrics_optimal:
    print(f"{metric_name}: {metrics_optimal[metric_name]:.4f}")

In [None]:
# ROC curve of the test outputs, with the area under it shown in the legend.
fpr, tpr, _ = roc_curve(y_test.numpy(), test_outputs.numpy())
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set(xlabel="False Positive Rate", ylabel="True Positive Rate", title="ROC Curve")
ax.legend()
plt.show()

In [None]:
# Confusion matrix of the test predictions. Note this thresholds at T (the value
# returned by training), not at optimal_threshold.
predictions = (test_outputs >= T).float().numpy()
cm = confusion_matrix(y_test.numpy(), predictions)

class_names = ["Negative", "Positive"]
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="True", title="Confusion Matrix")
plt.show()

In [None]:
# Persist the trained model object itself (not just its state_dict).
# NOTE(review): loading a whole pickled module presumably needs the VulnScreener
# class to be importable at load time — confirm against the consumer of this file.
model_path = '../codebert/vuln_screener_model.pth'
torch.save(model, model_path)