In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import os
import joblib
import ast
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, kurtosis
from imblearn.over_sampling import SMOTE
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, precision_recall_curve, confusion_matrix, precision_score, f1_score, recall_score

print(os.getcwd())

# Check for MPS availability (Apple Silicon GPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Training on device: {device}")

X_train = torch.load("../codebert/X_train.pt", weights_only=False).to(device)
X_test = torch.load("../codebert/X_test.pt", weights_only=False).to(device)
y_train = torch.load("../codebert/y_train.pt", weights_only=False).to(device)
y_test = torch.load("../codebert/y_test.pt", weights_only=False).to(device)

# Verify shapes
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Load VulnScreener and get probabilities
class VulnScreener(nn.Module):
    def __init__(self):
        super(VulnScreener, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(768, 256),  # Input layer to Hidden Layer 1
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),  # Hidden Layer 1 to Hidden Layer 2
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),    # Hidden Layer 2 to Output Layer
            nn.Sigmoid()          # Probability output
        )
    
    def forward(self, x):
        return self.mlp(x)  # Forward pass through the network

class VulnAnalyzer(nn.Module):
    def __init__(self, dropout_rate=0.2):
        super(VulnAnalyzer, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2)  # 769 -> 384
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(2)  # 384 -> 192
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.MaxPool1d(2)  # 192 -> 96
        )
        # Adjust residual path to match output size of conv3 (256 channels, 96 length)
        self.residual = nn.Sequential(
            nn.Conv1d(1, 256, kernel_size=1),
            nn.AvgPool1d(kernel_size=8, stride=8)  # Downsample 769 to ~96
        )
        self.fc_layers = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(256 * 96, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 9),
            nn.Sigmoid()
        )
    
    def forward(self, x, p_s):
        if p_s.dim() == 1:
            p_s = p_s.unsqueeze(1)
        x = torch.cat((x, p_s), dim=1).unsqueeze(1)  # [batch_size, 1, 769]
        residual = self.residual(x)  # [batch_size, 256, 96]
        x = self.conv1(x)  # [batch_size, 64, 384]
        x = self.conv2(x)  # [batch_size, 128, 192]
        x = self.conv3(x)  # [batch_size, 256, 96]
        # Ensure residual matches x’s size
        if residual.size(2) != x.size(2):
            residual = nn.functional.interpolate(residual, size=x.size(2), mode='nearest')
        x = x + residual  # Residual connection
        x = x.view(x.size(0), -1)  # [batch_size, 256 * 96]
        x = self.fc_layers(x)
        return x, None

screener = VulnScreener().to(device)
#analyzer = VulnAnalyzer().to(device)

screener = torch.load('../codebert/vuln_screener_model.pth', weights_only=False).to(device)
#analyzer = joblib.load('../codebert/vuln_analyzer_XGB_model.pkl')
analyzer = torch.load('../codebert/vuln_analyzer_model.pth', weights_only=False).to(device)

# 載入最佳 Thresholds
#with open("../codebert/optimal_thresholds.json", "r") as f:
#    optimal_thresholds = json.load(f)

# 確保 Thresholds 為 NumPy 陣列
#thresholds = np.array(list(optimal_thresholds.values()))


screener.eval()
analyzer.eval()

# 串接流程
with torch.no_grad():
    screener_train_prob = screener(X_train).to(device)
    screener_test_prob = screener(X_test).to(device)
    analyzer_train_prob,_ = analyzer(X_train, screener_train_prob)
    analyzer_test_prob,_ = analyzer(X_test, screener_test_prob)
    print(f"screener_train_prob: {screener_train_prob.shape}") 
    print(f"screener_test_prob: {screener_test_prob.shape}")
    print(f"analyzer_train_prob: {analyzer_train_prob.shape}")
    print(f"analyzer_test_prob: {analyzer_test_prob.shape}")

'''# 轉換 X_train / X_test 為 NumPy 並與 Screener 預測機率拼接
X_train_np = X_train.cpu().numpy()
X_test_np = X_test.cpu().numpy()

X_train_combined = np.hstack([X_train_np, screener_train_prob.cpu()])
X_test_combined = np.hstack([X_test_np, screener_test_prob.cpu()])

# XGBoost 進行預測
y_pred_train = analyzer.predict(X_train_combined)
y_pred_test = analyzer.predict(X_test_combined)

# 取得機率
analyzer_train_prob = np.array([est.predict_proba(X_train_combined)[:, 1] for est in analyzer.estimators_]).T
analyzer_test_prob = np.array([est.predict_proba(X_test_combined)[:, 1] for est in analyzer.estimators_]).T'''

/Users/rita/Documents/9309_ML/Smart_Contract_Vulnerabilities_Project/Model Training
Training on device: mps
X_train shape: torch.Size([4294, 768])
y_train shape: torch.Size([4294, 9])
X_test shape: torch.Size([1074, 768])
y_test shape: torch.Size([1074, 9])
screener_train_prob: torch.Size([4294, 1])
screener_test_prob: torch.Size([1074, 1])
analyzer_train_prob: torch.Size([4294, 9])
analyzer_test_prob: torch.Size([1074, 9])


'# 轉換 X_train / X_test 為 NumPy 並與 Screener 預測機率拼接\nX_train_np = X_train.cpu().numpy()\nX_test_np = X_test.cpu().numpy()\n\nX_train_combined = np.hstack([X_train_np, screener_train_prob.cpu()])\nX_test_combined = np.hstack([X_test_np, screener_test_prob.cpu()])\n\n# XGBoost 進行預測\ny_pred_train = analyzer.predict(X_train_combined)\ny_pred_test = analyzer.predict(X_test_combined)\n\n# 取得機率\nanalyzer_train_prob = np.array([est.predict_proba(X_train_combined)[:, 1] for est in analyzer.estimators_]).T\nanalyzer_test_prob = np.array([est.predict_proba(X_test_combined)[:, 1] for est in analyzer.estimators_]).T'

In [2]:
# Define VulnValidator without attention weights
class VulnValidator:
    def __init__(self, n_trees=500, max_depth=15, pca_components=50):
        self.rf = MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=n_trees,
                max_depth=max_depth,
                random_state=42,
                class_weight="balanced_subsample"  # Better imbalance handling
            )
        )
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=pca_components)
        self.feature_importance = None
        self.feature_names = None
    
    def prepare_features(self, X, p_a, p_s):
        X = X.cpu().detach().numpy() if torch.is_tensor(X) else X
        p_a = p_a.cpu().detach().numpy() if torch.is_tensor(p_a) else p_a
        p_s = p_s.cpu().detach().numpy() if torch.is_tensor(p_s) else p_s
        
        # Use PCA on X to retain more information
        #X_pca = self.pca.fit_transform(X)
        
        # Statistical features
        stats = np.hstack([
            X.mean(axis=1, keepdims=True),
            X.var(axis=1, keepdims=True),
            X.max(axis=1, keepdims=True),
            X.min(axis=1, keepdims=True)
        ])
        
        # Interaction terms
        interaction = p_a * p_s
        
        # Combine features
        features = np.hstack([X, p_a, p_s, stats, interaction])
        features = self.scaler.fit_transform(features)
        return features
    
    def fit(self, X, p_a, p_s, y_train):
        features = self.prepare_features(X, p_a, p_s)
        y_train = y_train.cpu().detach().numpy() if torch.is_tensor(y_train) else y_train
        
        # Fit the model
        self.rf.fit(features, y_train)
        
        # Aggregate feature importances
        self.feature_importance = np.mean([est.feature_importances_ for est in self.rf.estimators_], axis=0)
        
        # Feature names
        num = X.shape[1]
        num_p_a = p_a.shape[1]
        num_p_s = p_s.shape[1]
        self.feature_names = (
            [f"X_{i}" for i in range(num)] +
            [f"Analyzer_Prob_{i}" for i in range(num_p_a)] +
            [f"Screener_Prob_{i}" for i in range(num_p_s)] +
            ["Mean", "Variance", "Max", "Min"] +
            [f"Interaction_{i}" for i in range(num_p_a)]
        )
    
    def predict(self, X, p_a, p_s):
        features = self.prepare_features(X, p_a, p_s)
        p_v = self.rf.predict_proba(features)  # List of [n_samples, 2] arrays
        p_v = np.stack([prob[:, 1] for prob in p_v], axis=1)  # [n_samples, 9]
        return p_v
    
    def generate_validation_report(self, p_f, p_v, threshold=0.05, per_sample=False):
        report = {"anomalies": [], "corrections": [], "top_features": {}}
        if per_sample:
            report = {i: {"anomalies": [], "corrections": [], "top_features": {}} for i in range(p_f.shape[0])}
    
        p_f = p_f.cpu().detach().numpy() if torch.is_tensor(p_f) else p_f
        p_v = p_v.cpu().detach().numpy() if torch.is_tensor(p_v) else p_v
    
        # Ensure shapes match
        assert p_f.shape == p_v.shape, f"Shape mismatch: p_f {p_f.shape}, p_v {p_v.shape}"
    
        for i in range(p_f.shape[0]):  # Iterate over samples
            sample_anomalies = []
            sample_corrections = []
            for j in range(p_f.shape[1]):  # Iterate over vulnerabilities
                diff = np.abs(p_f[i, j] - p_v[i, j])
                #print(f"p_f:{p_f[i,j]}")
                #print(f"p_v:{p_v[i,j]}")
                #print(f"diff:{diff}")
                if diff > threshold:
                    if p_f[i, j] > 0.5 and p_v[i, j] < 0.5:
                        sample_anomalies.append(f"Vuln {j}")
                    elif p_f[i, j] < 0.5 and p_v[i, j] > 0.5:
                        sample_corrections.append(f"Vuln {j}")
        
            if per_sample:
                report[i]["anomalies"] = sample_anomalies
                report[i]["corrections"] = sample_corrections
            else:
                report["anomalies"].extend(sample_anomalies)
                report["corrections"].extend(sample_corrections)
                
        print("Calculating feature importances...")
        # Compute top features from all estimators
        if hasattr(self.rf, "estimators_"):
            print(f"Number of estimators: {len(self.rf.estimators_)}")
            # Aggregate feature importances across all classifiers
            avg_importances = np.mean([est.feature_importances_ for est in self.rf.estimators_], axis=0)
            print(f"Feature importances shape: {avg_importances.shape}")
            print(f"Feature names length: {len(self.feature_names) if self.feature_names else 'None'}")
            top_5_idx = np.argsort(avg_importances)[-5:][::-1]
            report["top_features"] = {self.feature_names[idx]: float(avg_importances[idx]) for idx in top_5_idx}
            if per_sample:
                for i in range(p_f.shape[0]):
                    report[i]["top_features"] = report["top_features"]
    
        return report

In [3]:
def fuse_outputs(p_a, p_v):
    p_v = p_v.to(p_a.device)
    p_v = p_v.expand(-1, p_a.shape[1])
    
    p_f = 0.5 * p_a + 0.5 * p_v
    
    return p_f

def evaluate_predictions(y_true, y_prob, threshold=0.5):
    """
    評估預測結果，計算整體及各漏洞的 Precision, Recall, F1-score, AUC。

    :param y_true: 真實標籤 (numpy array) - (N, num_vulns)
    :param y_prob: 預測機率 (numpy array) - (N, num_vulns)
    :param threshold: 判定為正類別的閾值，預設為 0.5
    """
    if isinstance(y_prob, torch.Tensor):
        y_prob = y_prob.cpu().numpy()  # 轉為 NumPy，確保在 CPU 上運行
    
    if isinstance(y_true, torch.Tensor):
        y_true = y_true.cpu().numpy()  # 同樣轉換 y_true
    
    # 轉換為二元標籤
    y_pred = (y_prob > threshold).astype(int)

    # 計算 Overall 指標
    overall_precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    overall_recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    overall_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    overall_auc = roc_auc_score(y_true, y_prob, average='macro') if len(set(y_true.flatten())) > 1 else None

    print(f"\nOverall Precision: {overall_precision:.4f}")
    print(f"Overall Recall: {overall_recall:.4f}")
    print(f"Overall F1-Score: {overall_f1:.4f}")
    print(f"Overall AUC: {overall_auc:.4f}\n")

    # 計算各個漏洞的 Precision, Recall, F1-score, AUC
    num_vulns = y_true.shape[1]
    for i in range(num_vulns):
        precision = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
        recall = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
        f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)
        auc = roc_auc_score(y_true[:, i], y_prob[:, i]) if len(set(y_true[:, i])) > 1 else None

        print(f"Vuln {i}: Precision={precision:.4f}, Recall={recall:.4f}, "
              f"F1-Score={f1:.4f}, AUC={auc:.4f}")

# Validation
def evaluate_results(p_f, y_true, analyzer_prob, validator_prob):
    p_f = p_f.cpu().numpy()  # Convert to numpy for easier handling
    y_true = y_true.cpu().numpy()  # Convert to numpy for comparison
    
    for i in range(p_f.shape[0]):  # 遍歷每一筆測試資料
        print(f"\nEvaluating sample {i+1}/{p_f.shape[0]}:")

        # 這裡將單一資料的預測結果轉為二進制（0或1）
        y_pred = (p_f[i, ] > 0.5).astype(int)
        y_true_sample = y_true[i,]

        # Print the evaluation metrics for the current sample
        print(f"Predicted: {y_pred}")
        print(f"True: {y_true_sample}")

        # 生成和顯示該筆資料的報告
        report = validator.generate_validation_report(p_f[i].reshape(1, -1), validator_prob[i].reshape(1, -1))
        print(f"Anomalies(Analyzer) for sample {i}: {report['anomalies']}")
        print(f"Corrections(Validator) for sample {i}: {report['corrections']}")

        # 顯示該筆資料的前 5 個特徵
        print(f"Top 5 Features for sample {i}: {report['top_features']}")
    

In [4]:
validator = VulnValidator()
validator.fit(X_train, analyzer_train_prob, screener_train_prob, y_train)

# Get validator predictions
validator_test_prob = torch.from_numpy(validator.predict(X_test, analyzer_test_prob, screener_test_prob))
validator_test_prob = validator_test_prob.clone().detach().to(device, dtype=torch.float32)
evaluate_predictions(y_test, validator_test_prob)

# Fuse outputs
fuse_test = fuse_outputs(analyzer_test_prob, validator_test_prob)
evaluate_predictions(y_test, fuse_test)

evaluate_results(fuse_test, y_test, analyzer_test_prob, validator_test_prob)


Overall Precision: 0.8186
Overall Recall: 0.6288
Overall F1-Score: 0.7040
Overall AUC: 0.9303

Vuln 0: Precision=0.8836, Recall=0.8632, F1-Score=0.8733, AUC=0.9331
Vuln 1: Precision=0.9423, Recall=0.6806, F1-Score=0.7903, AUC=0.9663
Vuln 2: Precision=0.8000, Recall=0.4444, F1-Score=0.5714, AUC=0.9371
Vuln 3: Precision=0.8000, Recall=0.4444, F1-Score=0.5714, AUC=0.9371
Vuln 4: Precision=0.8378, Recall=0.5254, F1-Score=0.6458, AUC=0.9270
Vuln 5: Precision=0.6400, Recall=0.5766, F1-Score=0.6066, AUC=0.8692
Vuln 6: Precision=0.8656, Recall=0.7931, F1-Score=0.8278, AUC=0.9420
Vuln 7: Precision=0.7179, Recall=0.5385, F1-Score=0.6154, AUC=0.9221
Vuln 8: Precision=0.8798, Recall=0.7931, F1-Score=0.8342, AUC=0.9384

Overall Precision: 0.7699
Overall Recall: 0.6837
Overall F1-Score: 0.7204
Overall AUC: 0.9361

Vuln 0: Precision=0.8750, Recall=0.8703, F1-Score=0.8727, AUC=0.9277
Vuln 1: Precision=0.9245, Recall=0.6806, F1-Score=0.7840, AUC=0.9443
Vuln 2: Precision=0.7059, Recall=0.6667, F1-Score

In [5]:
joblib.dump(validator.rf, '../codebert/vuln_validator_model.pkl')
joblib.dump(validator.scaler, '../codebert/scaler.pkl')
joblib.dump(validator.pca, '../codebert/pca.pkl')

['../codebert/pca.pkl']