<a href="https://colab.research.google.com/github/ShamaSharma/SVD/blob/main/DevignML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#!/usr/bin/env python3
"""
Devign Vulnerability Detection - ML Baseline with Full Metrics and Visualizations
Complete working version with all imports and error handling
"""

import os
import csv
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score
)
import xgboost as xgb

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

# Configuration
TRAIN_SIZE = None  # None = use full dataset, or set to integer (e.g., 5000) for subset
MAX_FEATURES = 5000
TEST_SIZE = 0.2
RANDOM_STATE = 42
RESULTS_DIR = "ml_results_full_metrics"
PLOTS_DIR = "ml_plots"

# -----------------------------
# Step 1: Load Devign dataset
# -----------------------------
def load_data(train_size=TRAIN_SIZE):
    """Load and validate dataset"""
    try:
        print("Loading Devign dataset...")
        ds = load_dataset("DetectVul/devign")

        # Determine how many samples to use
        available_samples = len(ds['train'])

        if train_size is None:
            train_size = available_samples
            print(f"Using FULL dataset: {train_size} samples")
        else:
            if available_samples < train_size:
                print(f"Warning: Only {available_samples} samples available, using all.")
                train_size = available_samples
            else:
                print(f"Using subset: {train_size} of {available_samples} samples")

        X_text = ds['train']['func'][:train_size]

        # Handle label field - it might be 'label' or 'target'
        if 'target' in ds['train'].column_names:
            y = ds['train']['target'][:train_size]
        else:
            y = ds['train']['label'][:train_size]

        # Convert labels if they're lists
        if isinstance(y[0], list):
            y = [label[0] if isinstance(label, list) else label for label in y]

        # Convert to standard Python types
        y = [int(label) for label in y]

        # Validate we have both classes
        unique_labels = set(y)
        if len(unique_labels) < 2:
            raise ValueError(f"Dataset only contains one class: {unique_labels}")

        print(f"Loaded {train_size} samples with {len(unique_labels)} classes")
        return X_text, y

    except Exception as e:
        print(f"Error loading dataset: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

# -----------------------------
# Step 2: Convert code to features
# -----------------------------
def vectorize_data(X_text, max_features=MAX_FEATURES):
    """Convert text to feature vectors"""
    try:
        print("Vectorizing code...")
        vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', max_features=max_features)
        X = vectorizer.fit_transform(X_text)
        print(f"Feature matrix shape: {X.shape}")
        return X, vectorizer
    except Exception as e:
        print(f"Error during vectorization: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

# -----------------------------
# Step 3: Define ML models
# -----------------------------
def get_models():
    """Return dictionary of ML models (SVM excluded for memory efficiency)"""
    return {
        "Naive_Bayes": MultinomialNB(),
        "Logistic_Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        "Random_Forest": RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE, n_jobs=-1),
        "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE, use_label_encoder=False)
    }

# -----------------------------
# Step 4: Calculate metrics
# -----------------------------
def calculate_metrics(y_test, y_pred, y_pred_proba=None):
    """Calculate all classification metrics"""
    acc = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Handle binary and multiclass cases
    if cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    else:
        fpr = "N/A (multiclass)"
        fnr = "N/A (multiclass)"

    # Calculate average precision for PR curve
    avg_precision = None
    if y_pred_proba is not None:
        try:
            avg_precision = average_precision_score(y_test, y_pred_proba)
        except Exception as e:
            print(f"  Warning: Could not calculate average precision: {e}")
            avg_precision = None

    return acc, report_dict, cm, fpr, fnr, avg_precision

# -----------------------------
# Step 5: Save results to CSV
# -----------------------------
def save_results(name, acc, report_dict, cm, fpr, fnr, results_dir):
    """Save model results to CSV file"""
    try:
        csv_file = os.path.join(results_dir, f"{name}_metrics.csv")

        with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)

            writer.writerow(["Metric", "Value"])
            writer.writerow(["Overall Accuracy", f"{acc:.4f}"])
            writer.writerow([])

            writer.writerow(["Class-wise Metrics"])
            writer.writerow(["Class", "Precision", "Recall", "F1-Score", "Support"])

            for class_label in sorted([k for k in report_dict.keys() if k.isdigit() or k == '0']):
                if class_label in report_dict:
                    metrics = report_dict[class_label]
                    writer.writerow([
                        f"Class {class_label}",
                        f"{metrics.get('precision', 0):.4f}",
                        f"{metrics.get('recall', 0):.4f}",
                        f"{metrics.get('f1-score', 0):.4f}",
                        metrics.get('support', 0)
                    ])

            writer.writerow([])
            writer.writerow(["Aggregate Metrics"])
            for avg_type in ['macro avg', 'weighted avg']:
                if avg_type in report_dict:
                    metrics = report_dict[avg_type]
                    writer.writerow([
                        avg_type,
                        f"{metrics.get('precision', 0):.4f}",
                        f"{metrics.get('recall', 0):.4f}",
                        f"{metrics.get('f1-score', 0):.4f}",
                        metrics.get('support', 0)
                    ])

            writer.writerow([])
            writer.writerow(["Error Rates"])
            writer.writerow(["False Positive Rate (FPR)", f"{fpr:.4f}" if isinstance(fpr, float) else fpr])
            writer.writerow(["False Negative Rate (FNR)", f"{fnr:.4f}" if isinstance(fnr, float) else fnr])

            writer.writerow([])
            writer.writerow(["Confusion Matrix"])
            writer.writerow([""] + [f"Predicted {i}" for i in range(cm.shape[1])])
            for i, row in enumerate(cm):
                writer.writerow([f"Actual {i}"] + row.tolist())

        print(f"✓ {name} metrics saved to {csv_file}")

    except Exception as e:
        print(f"Error saving results for {name}: {e}")

# -----------------------------
# Step 6: Generate Visualizations
# -----------------------------
def plot_confusion_matrix(cm, model_name, plots_dir):
    """Plot confusion matrix heatmap"""
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Safe (0)', 'Vulnerable (1)'],
                yticklabels=['Safe (0)', 'Vulnerable (1)'],
                cbar_kws={'label': 'Count'})
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('Actual Class', fontsize=12)
    plt.xlabel('Predicted Class', fontsize=12)
    plt.tight_layout()

    filename = os.path.join(plots_dir, f'{model_name}_confusion_matrix.png')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  ✓ Confusion matrix saved: {filename}")

def generate_visualizations(all_results, plots_dir):
    """Generate all priority visualizations"""

    model_names = list(all_results.keys())
    metrics_data = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': []}
    class_0_metrics = {'Precision': [], 'Recall': [], 'F1-Score': []}
    class_1_metrics = {'Precision': [], 'Recall': [], 'F1-Score': []}
    fpr_list = []
    fnr_list = []

    for model_name in model_names:
        result = all_results[model_name]
        report = result['report']

        metrics_data['Accuracy'].append(result['accuracy'])
        metrics_data['Precision'].append(report['weighted avg']['precision'])
        metrics_data['Recall'].append(report['weighted avg']['recall'])
        metrics_data['F1-Score'].append(report['weighted avg']['f1-score'])

        class_0_metrics['Precision'].append(report['0']['precision'])
        class_0_metrics['Recall'].append(report['0']['recall'])
        class_0_metrics['F1-Score'].append(report['0']['f1-score'])

        class_1_metrics['Precision'].append(report['1']['precision'])
        class_1_metrics['Recall'].append(report['1']['recall'])
        class_1_metrics['F1-Score'].append(report['1']['f1-score'])

        fpr_list.append(result['fpr'])
        fnr_list.append(result['fnr'])

    # Multi-Metric Bar Chart
    print("\n  Generating multi-metric comparison...")
    fig, ax = plt.subplots(figsize=(12, 7))
    x = np.arange(len(model_names))
    width = 0.2

    bars1 = ax.bar(x - 1.5*width, metrics_data['Accuracy'], width, label='Accuracy', color='#2ecc71')
    bars2 = ax.bar(x - 0.5*width, metrics_data['Precision'], width, label='Precision', color='#3498db')
    bars3 = ax.bar(x + 0.5*width, metrics_data['Recall'], width, label='Recall', color='#e74c3c')
    bars4 = ax.bar(x + 1.5*width, metrics_data['F1-Score'], width, label='F1-Score', color='#f39c12')

    ax.set_xlabel('Models', fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12, fontweight='bold')
    ax.set_title('Model Performance Comparison - All Metrics', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=15, ha='right')
    ax.legend(loc='lower right', fontsize=10)
    ax.set_ylim([0, 1.05])
    ax.grid(axis='y', alpha=0.3)

    for bars in [bars1, bars2, bars3, bars4]:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.3f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    filename = os.path.join(plots_dir, 'multi_metric_comparison.png')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  ✓ Multi-metric comparison saved: {filename}")

    # Precision-Recall Curves
    print("  Generating precision-recall curves...")
    plt.figure(figsize=(10, 7))

    has_curves = False
    for model_name in model_names:
        result = all_results[model_name]
        if result['pr_curve'] is not None:
            precision_vals, recall_vals, _ = result['pr_curve']
            avg_precision = result['avg_precision']
            if avg_precision is not None:
                plt.plot(recall_vals, precision_vals, linewidth=2,
                        label=f"{model_name} (AP={avg_precision:.3f})")
            else:
                plt.plot(recall_vals, precision_vals, linewidth=2, label=f"{model_name}")
            has_curves = True

    if has_curves:
        plt.xlabel('Recall', fontsize=12, fontweight='bold')
        plt.ylabel('Precision', fontsize=12, fontweight='bold')
        plt.title('Precision-Recall Curves - All Models', fontsize=14, fontweight='bold')
        plt.legend(loc='best', fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.xlim([0, 1])
        plt.ylim([0, 1.05])
        plt.tight_layout()

        filename = os.path.join(plots_dir, 'precision_recall_curves.png')
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Precision-recall curves saved: {filename}")
    else:
        plt.close()
        print("  ⚠ Skipping PR curves (no probability predictions available)")

    # FPR vs FNR Comparison
    print("  Generating FPR vs FNR comparison...")
    fig, ax = plt.subplots(figsize=(12, 7))
    x = np.arange(len(model_names))
    width = 0.35

    bars1 = ax.bar(x - width/2, fpr_list, width, label='False Positive Rate (FPR)',
                   color='#e74c3c', alpha=0.8)
    bars2 = ax.bar(x + width/2, fnr_list, width, label='False Negative Rate (FNR)',
                   color='#c0392b', alpha=0.8)

    ax.set_xlabel('Models', fontsize=12, fontweight='bold')
    ax.set_ylabel('Error Rate', fontsize=12, fontweight='bold')
    ax.set_title('False Positive Rate vs False Negative Rate', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=15, ha='right')
    ax.legend(fontsize=10)
    ax.set_ylim([0, max(max(fpr_list), max(fnr_list)) * 1.2])
    ax.grid(axis='y', alpha=0.3)

    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.3f}', ha='center', va='bottom', fontsize=9)

    ax.text(0.02, 0.98, '⚠️ Lower is better for both metrics\nFNR is critical for security!',
            transform=ax.transAxes, fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    filename = os.path.join(plots_dir, 'fpr_vs_fnr_comparison.png')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  ✓ FPR vs FNR comparison saved: {filename}")

    # Class-wise Performance Comparison
    print("  Generating class-wise performance comparison...")
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    x = np.arange(len(model_names))
    width = 0.25

    # Class 0 (Safe)
    ax = axes[0]
    bars1 = ax.bar(x - width, class_0_metrics['Precision'], width, label='Precision', color='#3498db')
    bars2 = ax.bar(x, class_0_metrics['Recall'], width, label='Recall', color='#2ecc71')
    bars3 = ax.bar(x + width, class_0_metrics['F1-Score'], width, label='F1-Score', color='#9b59b6')

    ax.set_xlabel('Models', fontsize=11, fontweight='bold')
    ax.set_ylabel('Score', fontsize=11, fontweight='bold')
    ax.set_title('Class 0 (Safe Code) Performance', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=15, ha='right')
    ax.legend(fontsize=9)
    ax.set_ylim([0, 1.05])
    ax.grid(axis='y', alpha=0.3)

    # Class 1 (Vulnerable)
    ax = axes[1]
    bars1 = ax.bar(x - width, class_1_metrics['Precision'], width, label='Precision', color='#3498db')
    bars2 = ax.bar(x, class_1_metrics['Recall'], width, label='Recall', color='#e74c3c')
    bars3 = ax.bar(x + width, class_1_metrics['F1-Score'], width, label='F1-Score', color='#9b59b6')

    ax.set_xlabel('Models', fontsize=11, fontweight='bold')
    ax.set_ylabel('Score', fontsize=11, fontweight='bold')
    ax.set_title('Class 1 (Vulnerable Code) Performance ⚠️', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=15, ha='right')
    ax.legend(fontsize=9)
    ax.set_ylim([0, 1.05])
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    filename = os.path.join(plots_dir, 'classwise_performance.png')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  ✓ Class-wise performance saved: {filename}")

    print(f"\n✅ All visualizations saved to '{plots_dir}/' directory")

# -----------------------------
# Main execution
# -----------------------------
def main():
    """Main execution function"""
    # Load data
    X_text, y = load_data(TRAIN_SIZE)

    # Vectorize
    X, vectorizer = vectorize_data(X_text, MAX_FEATURES)

    # Split data
    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Create results directory
    os.makedirs(RESULTS_DIR, exist_ok=True)
    os.makedirs(PLOTS_DIR, exist_ok=True)

    # Get models
    models = get_models()

    # Store all results for visualization
    all_results = {}

    # Train and evaluate
    print("\n" + "="*50)
    print("Training and Evaluating Models")
    print("="*50)

    for name, model in models.items():
        try:
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)

            print(f"Predicting with {name}...")
            y_pred = model.predict(X_test)

            # Get probability predictions for PR curve
            y_pred_proba = None
            try:
                if hasattr(model, "predict_proba"):
                    y_pred_proba = model.predict_proba(X_test)[:, 1]
                elif hasattr(model, "decision_function"):
                    y_pred_proba = model.decision_function(X_test)
            except Exception as e:
                print(f"  Warning: Could not get probability predictions: {e}")

            # Calculate metrics
            acc, report_dict, cm, fpr, fnr, avg_precision = calculate_metrics(y_test, y_pred, y_pred_proba)

            # Calculate PR curve data
            pr_curve = None
            if y_pred_proba is not None:
                try:
                    precision_vals, recall_vals, thresholds = precision_recall_curve(y_test, y_pred_proba)
                    pr_curve = (precision_vals, recall_vals, thresholds)
                except Exception as e:
                    print(f"  Warning: Could not calculate PR curve: {e}")

            # Store results
            all_results[name] = {
                'accuracy': acc,
                'report': report_dict,
                'confusion_matrix': cm,
                'fpr': fpr,
                'fnr': fnr,
                'pr_curve': pr_curve,
                'avg_precision': avg_precision
            }

            # Save CSV results
            save_results(name, acc, report_dict, cm, fpr, fnr, RESULTS_DIR)

            # Generate confusion matrix plot
            plot_confusion_matrix(cm, name, PLOTS_DIR)

            print(f"  Accuracy: {acc:.4f}")

        except Exception as e:
            print(f"✗ Error with {name}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Generate comparison visualizations
    if all_results:
        print("\n" + "="*50)
        print("Generating Comparison Visualizations")
        print("="*50)
        generate_visualizations(all_results, PLOTS_DIR)

    print("\n" + "="*50)
    print(f"✅ All results saved to '{RESULTS_DIR}/' directory")
    print(f"✅ All plots saved to '{PLOTS_DIR}/' directory")
    print("="*50)

if __name__ == "__main__":
    main()

Loading Devign dataset...
Using FULL dataset: 21854 samples
Loaded 21854 samples with 2 classes
Vectorizing code...
Feature matrix shape: (21854, 5000)
Splitting data...

Training and Evaluating Models

Training Naive_Bayes...
Predicting with Naive_Bayes...
✓ Naive_Bayes metrics saved to ml_results_full_metrics/Naive_Bayes_metrics.csv
  ✓ Confusion matrix saved: ml_plots/Naive_Bayes_confusion_matrix.png
  Accuracy: 0.5884

Training Logistic_Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Predicting with Logistic_Regression...
✓ Logistic_Regression metrics saved to ml_results_full_metrics/Logistic_Regression_metrics.csv
  ✓ Confusion matrix saved: ml_plots/Logistic_Regression_confusion_matrix.png
  Accuracy: 0.5795

Training Random_Forest...
Predicting with Random_Forest...
✓ Random_Forest metrics saved to ml_results_full_metrics/Random_Forest_metrics.csv
  ✓ Confusion matrix saved: ml_plots/Random_Forest_confusion_matrix.png
  Accuracy: 0.5841

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Predicting with XGBoost...
✓ XGBoost metrics saved to ml_results_full_metrics/XGBoost_metrics.csv
  ✓ Confusion matrix saved: ml_plots/XGBoost_confusion_matrix.png
  Accuracy: 0.5896

Generating Comparison Visualizations

  Generating multi-metric comparison...
  ✓ Multi-metric comparison saved: ml_plots/multi_metric_comparison.png
  Generating precision-recall curves...
  ✓ Precision-recall curves saved: ml_plots/precision_recall_curves.png
  Generating FPR vs FNR comparison...
  ✓ FPR vs FNR comparison saved: ml_plots/fpr_vs_fnr_comparison.png
  Generating class-wise performance comparison...
  ✓ Class-wise performance saved: ml_plots/classwise_performance.png

✅ All visualizations saved to 'ml_plots/' directory

✅ All results saved to 'ml_results_full_metrics/' directory
✅ All plots saved to 'ml_plots/' directory
