# 🛡️ VulnML: Vulnerability Detection Model Training
## Google Colab Training Environment

This notebook trains bounty-prediction and severity-classification models on a synthetically generated vulnerability dataset, using GPU acceleration for XGBoost when a GPU is available.

**Features:**
- 🚀 GPU-accelerated training
- 📊 Real-time metrics tracking
- 💾 Automatic model saving to Google Drive
- 🔧 Production-ready model validation

---

## 🔧 Setup & Dependencies

In [None]:
# Report whether PyTorch can see a CUDA device and, if so, which one
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("⚠️ No GPU detected. Training will be slower on CPU.")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1024**3:.1f} GB")

In [None]:
# Install required packages
!pip install -q scikit-learn pandas numpy matplotlib seaborn
!pip install -q xgboost lightgbm
!pip install -q plotly kaleido
!pip install -q joblib

print("✅ Dependencies installed successfully!")

In [None]:
# Attach Google Drive so models and results survive the Colab session
from google.colab import drive
drive.mount('/content/drive')

# Make sure the two output folders exist (no error if they already do)
import os
for output_dir in ('/content/drive/MyDrive/VulnML_Models',
                   '/content/drive/MyDrive/VulnML_Results'):
    os.makedirs(output_dir, exist_ok=True)

print("✅ Google Drive mounted and directories created!")

## 📦 Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
from datetime import datetime
from pathlib import Path
import logging

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    mean_absolute_error, r2_score, accuracy_score,
    classification_report, confusion_matrix, roc_auc_score
)
from sklearn.linear_model import Ridge, LogisticRegression
import xgboost as xgb
import joblib

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")

## 🤖 VulnML Training Class

In [None]:
class VulnMLColabTrainer:
    """Enhanced vulnerability detection model trainer optimized for Colab"""
    
    def __init__(self, drive_path='/content/drive/MyDrive/VulnML_Models'):
        self.drive_path = Path(drive_path)
        self.results_path = Path('/content/drive/MyDrive/VulnML_Results')
        
        # Setup logging
        self.logger = self._setup_logging()
        
        # Model storage
        self.models = {}
        self.scalers = {}
        self.vectorizers = {}
        self.encoders = {}
        
        # Training results
        self.training_results = {}
        
        # GPU device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
    def _setup_logging(self):
        """Setup logging for Colab"""
        logger = logging.getLogger('VulnML_Colab')
        logger.setLevel(logging.INFO)
        
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            
        return logger
    
    def generate_realistic_vuln_data(self, n_samples=15000):
        """Generate realistic vulnerability dataset for training"""
        self.logger.info(f"🔧 Generating {n_samples:,} vulnerability samples...")
        
        # Enhanced vulnerability categories with realistic distributions
        vuln_categories = {
            'web_application': {
                'types': {
                    'Cross-site Scripting (XSS)': {'severity_dist': {'Low': 0.4, 'Medium': 0.4, 'High': 0.2}, 'bounty_range': (50, 5000)},
                    'SQL Injection': {'severity_dist': {'Medium': 0.3, 'High': 0.5, 'Critical': 0.2}, 'bounty_range': (500, 25000)},
                    'Cross-Site Request Forgery (CSRF)': {'severity_dist': {'Low': 0.3, 'Medium': 0.5, 'High': 0.2}, 'bounty_range': (200, 8000)},
                    'Insecure Direct Object Reference (IDOR)': {'severity_dist': {'Medium': 0.6, 'High': 0.3, 'Critical': 0.1}, 'bounty_range': (300, 15000)},
                    'Server-Side Request Forgery (SSRF)': {'severity_dist': {'High': 0.6, 'Critical': 0.4}, 'bounty_range': (2000, 50000)}
                },
                'weight': 0.45
            },
            'system_security': {
                'types': {
                    'Remote Code Execution': {'severity_dist': {'High': 0.3, 'Critical': 0.7}, 'bounty_range': (10000, 100000)},
                    'Privilege Escalation': {'severity_dist': {'Medium': 0.2, 'High': 0.5, 'Critical': 0.3}, 'bounty_range': (5000, 75000)},
                    'Authentication Bypass': {'severity_dist': {'High': 0.6, 'Critical': 0.4}, 'bounty_range': (8000, 60000)},
                    'Buffer Overflow': {'severity_dist': {'High': 0.4, 'Critical': 0.6}, 'bounty_range': (15000, 80000)}
                },
                'weight': 0.25
            },
            'blockchain_defi': {
                'types': {
                    'Flash Loan Attack': {'severity_dist': {'Critical': 1.0}, 'bounty_range': (100000, 2000000)},
                    'Reentrancy Vulnerability': {'severity_dist': {'High': 0.3, 'Critical': 0.7}, 'bounty_range': (75000, 1500000)},
                    'Price Oracle Manipulation': {'severity_dist': {'High': 0.4, 'Critical': 0.6}, 'bounty_range': (50000, 1000000)},
                    'Smart Contract Logic Error': {'severity_dist': {'Medium': 0.2, 'High': 0.5, 'Critical': 0.3}, 'bounty_range': (25000, 500000)}
                },
                'weight': 0.15
            },
            'infrastructure': {
                'types': {
                    'Information Disclosure': {'severity_dist': {'Low': 0.5, 'Medium': 0.4, 'High': 0.1}, 'bounty_range': (100, 5000)},
                    'Cryptographic Weakness': {'severity_dist': {'Medium': 0.3, 'High': 0.5, 'Critical': 0.2}, 'bounty_range': (5000, 40000)},
                    'Business Logic Bypass': {'severity_dist': {'Medium': 0.4, 'High': 0.6}, 'bounty_range': (3000, 25000)},
                    'Rate Limiting Bypass': {'severity_dist': {'Low': 0.4, 'Medium': 0.6}, 'bounty_range': (500, 8000)}
                },
                'weight': 0.15
            }
        }
        
        # Program tiers
        program_tiers = {
            'tier1': {'programs': ['Google', 'Microsoft', 'Apple', 'Meta', 'Amazon'], 'multiplier': 1.5, 'weight': 0.25},
            'tier2': {'programs': ['Netflix', 'Uber', 'PayPal', 'GitHub', 'Slack'], 'multiplier': 1.2, 'weight': 0.35},
            'tier3': {'programs': ['Shopify', 'Discord', 'Reddit', 'Spotify'], 'multiplier': 1.0, 'weight': 0.25},
            'defi': {'programs': ['Uniswap', 'Compound', 'Aave', 'MakerDAO'], 'multiplier': 2.0, 'weight': 0.15}
        }
        
        all_data = []
        
        # Generate samples for each category
        for category_name, category_data in vuln_categories.items():
            category_samples = int(n_samples * category_data['weight'])
            
            for i in range(category_samples):
                # Select vulnerability type
                vuln_type = np.random.choice(list(category_data['types'].keys()))
                vuln_config = category_data['types'][vuln_type]
                
                # Select severity based on vulnerability-specific distribution
                severities = list(vuln_config['severity_dist'].keys())
                severity_probs = list(vuln_config['severity_dist'].values())
                severity = np.random.choice(severities, p=severity_probs)
                
                # Select program tier and specific program
                tier_names = list(program_tiers.keys())
                tier_weights = [tier['weight'] for tier in program_tiers.values()]
                selected_tier = np.random.choice(tier_names, p=tier_weights)
                
                tier_data = program_tiers[selected_tier]
                program = np.random.choice(tier_data['programs'])
                
                # Calculate bounty
                base_min, base_max = vuln_config['bounty_range']
                base_bounty = np.random.uniform(base_min, base_max)
                
                # Apply program tier multiplier
                final_bounty = base_bounty * tier_data['multiplier']
                
                # Add realistic noise
                noise = np.random.uniform(0.8, 1.3)
                final_bounty *= noise
                
                # Generate additional features
                cve_score = self._generate_cve_score(severity)
                complexity = np.random.choice(['Low', 'Medium', 'High'], p=[0.3, 0.5, 0.2])
                
                record = {
                    'id': f"{category_name}_{i+1}",
                    'vulnerability_type': vuln_type,
                    'severity_level': severity,
                    'bounty_amount': round(final_bounty, 2),
                    'program_name': program,
                    'program_tier': selected_tier,
                    'category': category_name,
                    'cve_score': cve_score,
                    'complexity': complexity,
                    'description': f"{severity} {vuln_type} in {program} {category_name} system"
                }
                
                all_data.append(record)
        
        df = pd.DataFrame(all_data)
        
        # Save to drive
        csv_path = self.results_path / f"vuln_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(csv_path, index=False)
        
        self.logger.info(f"✅ Generated {len(df):,} samples")
        self.logger.info(f"💰 Bounty range: ${df['bounty_amount'].min():,.0f} - ${df['bounty_amount'].max():,.0f}")
        self.logger.info(f"📊 Severity distribution: {dict(df['severity_level'].value_counts(normalize=True).round(3))}")
        
        return df
    
    def _generate_cve_score(self, severity):
        """Generate realistic CVE scores based on severity"""
        score_ranges = {
            'Low': (0.1, 3.9),
            'Medium': (4.0, 6.9),
            'High': (7.0, 8.9),
            'Critical': (9.0, 10.0)
        }
        min_score, max_score = score_ranges.get(severity, (5.0, 7.0))
        return round(np.random.uniform(min_score, max_score), 1)
    
    def prepare_features(self, df):
        """Prepare enhanced features for training"""
        self.logger.info("🔧 Preparing enhanced features...")
        
        features = []
        
        for _, row in df.iterrows():
            # Basic text features
            vuln_type = str(row['vulnerability_type'])
            severity = str(row['severity_level'])
            program = str(row['program_name'])
            category = str(row['category'])
            complexity = str(row.get('complexity', 'Medium'))
            
            # Enhanced feature vector
            feature_vector = [
                # Text length features
                len(vuln_type),
                len(program),
                len(str(row['description'])),
                
                # CVE score
                float(row.get('cve_score', 5.0)),
                
                # Severity one-hot encoding
                1 if severity == 'Critical' else 0,
                1 if severity == 'High' else 0,
                1 if severity == 'Medium' else 0,
                1 if severity == 'Low' else 0,
                
                # Vulnerability type indicators
                1 if 'SQL' in vuln_type.upper() else 0,
                1 if 'XSS' in vuln_type.upper() else 0,
                1 if 'RCE' in vuln_type.upper() or 'REMOTE CODE' in vuln_type.upper() else 0,
                1 if 'SSRF' in vuln_type.upper() else 0,
                1 if 'IDOR' in vuln_type.upper() else 0,
                1 if 'CSRF' in vuln_type.upper() else 0,
                1 if 'FLASH LOAN' in vuln_type.upper() else 0,
                1 if 'REENTRANCY' in vuln_type.upper() else 0,
                1 if 'PRIVILEGE' in vuln_type.upper() else 0,
                1 if 'BUFFER' in vuln_type.upper() else 0,
                
                # Category indicators
                1 if category == 'web_application' else 0,
                1 if category == 'system_security' else 0,
                1 if category == 'blockchain_defi' else 0,
                1 if category == 'infrastructure' else 0,
                
                # Program tier scoring
                self._get_program_score(program),
                
                # Complexity scoring
                {'Low': 0.3, 'Medium': 0.6, 'High': 1.0}.get(complexity, 0.6),
                
                # Severity numerical score
                {'Low': 0.25, 'Medium': 0.5, 'High': 0.75, 'Critical': 1.0}.get(severity, 0.5),
                
                # Risk score based on vulnerability type
                self._get_risk_score(vuln_type)
            ]
            
            features.append(feature_vector)
        
        return np.array(features)
    
    def _get_program_score(self, program):
        """Get program reputation score"""
        tier1 = ['Google', 'Microsoft', 'Apple', 'Meta', 'Amazon']
        tier2 = ['Netflix', 'Uber', 'PayPal', 'GitHub', 'Slack']
        defi = ['Uniswap', 'Compound', 'Aave', 'MakerDAO']
        
        if program in tier1:
            return 1.0
        elif program in defi:
            return 0.9
        elif program in tier2:
            return 0.7
        else:
            return 0.5
    
    def _get_risk_score(self, vuln_type):
        """Get vulnerability risk score"""
        critical_risks = ['Remote Code Execution', 'Flash Loan Attack', 'Reentrancy']
        high_risks = ['SQL Injection', 'SSRF', 'Privilege Escalation', 'Buffer Overflow']
        medium_risks = ['XSS', 'IDOR', 'CSRF', 'Authentication Bypass']
        
        vuln_upper = vuln_type.upper()
        
        for critical in critical_risks:
            if critical.upper() in vuln_upper:
                return 1.0
        
        for high in high_risks:
            if high.upper() in vuln_upper:
                return 0.8
        
        for medium in medium_risks:
            if medium.upper() in vuln_upper:
                return 0.6
        
        return 0.4

print("✅ VulnML Trainer class defined successfully!")

## 🏋️ Training Methods

In [None]:
# Add training methods to the VulnMLColabTrainer class
def train_bounty_predictor(self, X, y):
    """Train several bounty regressors and keep the one with the best CV R².

    The data is split 80/20 first and the scaler is fitted on the training
    part only. Fitting it on all rows before the split would leak test-set
    statistics into preprocessing and make the test metrics optimistic.

    Args:
        X: 2-D feature array of shape (n_samples, n_features).
        y: 1-D array of bounty amounts aligned with the rows of ``X``.

    Returns:
        dict with ``best_model`` (class name), ``cv_r2_mean``, ``test_r2`` and
        ``test_mae`` of the best model, ``model_comparison`` (per-model
        metrics), ``samples_count`` and ``features_count``.

    Side effects:
        Stores the fitted scaler in ``self.scalers['bounty']``, the best model
        in ``self.models['bounty_predictor']`` and the returned dict in
        ``self.training_results['bounty_prediction']``.
    """
    self.logger.info("🤖 Training bounty prediction models...")

    # Train/test split on the raw features
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Feature scaling: learn mean/variance from the training rows only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    self.scalers['bounty'] = scaler

    # Model ensemble
    models = {
        'random_forest': RandomForestRegressor(
            n_estimators=200,
            max_depth=12,
            min_samples_split=10,
            min_samples_leaf=4,
            random_state=42,
            n_jobs=-1
        ),
        'xgboost': xgb.XGBRegressor(
            n_estimators=200,
            max_depth=8,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            tree_method='hist',
            device='cuda' if torch.cuda.is_available() else 'cpu'
        ),
        'ridge': Ridge(alpha=1.0, random_state=42)
    }

    best_model = None
    best_name = None
    best_score = -np.inf
    model_results = {}

    for name, model in models.items():
        self.logger.info(f"  Training {name}...")

        # Cross-validation on the training part.
        # NOTE: the folds share the scaler fitted on the whole training part.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        # Train on the full training part and evaluate on the held-out rows
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)

        test_r2 = r2_score(y_test, y_pred_test)
        test_mae = mean_absolute_error(y_test, y_pred_test)

        model_results[name] = {
            'cv_r2_mean': cv_mean,
            'cv_r2_std': cv_std,
            'test_r2': test_r2,
            'test_mae': test_mae
        }

        self.logger.info(f"    CV R²: {cv_mean:.3f}±{cv_std:.3f}")
        self.logger.info(f"    Test R²: {test_r2:.3f}, MAE: ${test_mae:,.0f}")

        # Model selection uses the CV score, never the test score
        if cv_mean > best_score:
            best_score = cv_mean
            best_model = model
            best_name = name

    self.models['bounty_predictor'] = best_model

    best_metrics = model_results.get(best_name, {})
    results = {
        'best_model': type(best_model).__name__,
        'cv_r2_mean': best_score,
        'test_r2': best_metrics.get('test_r2'),
        'test_mae': best_metrics.get('test_mae'),
        'model_comparison': model_results,
        'samples_count': len(X),
        'features_count': X.shape[1]
    }

    self.training_results['bounty_prediction'] = results
    self.logger.info(f"✅ Best bounty model: {results['best_model']} (R²={best_score:.3f})")

    return results

def train_severity_classifier(self, df):
    """Train text classifiers for severity and keep the best by CV accuracy.

    Each row becomes one text made of its vulnerability type, program,
    category and description, vectorized with TF-IDF (1- to 3-grams).

    Leakage guard: the generated ``description`` contains the severity label
    itself (for example "High SQL Injection in ..."). Left in, the classifier
    only learns to read the answer back. Every word equal to one of the
    severity labels is therefore removed from the description before
    vectorizing.

    Args:
        df: DataFrame with the columns ``vulnerability_type``,
            ``program_name``, ``category``, ``description`` and
            ``severity_level`` (the target).

    Returns:
        dict with ``best_model`` (class name), ``cv_accuracy_mean``,
        ``test_accuracy``, ``classification_report``, ``class_names`` and
        ``model_comparison``.

    Side effects:
        Stores the fitted vectorizer, label encoder and best model in
        ``self.vectorizers['severity']``, ``self.encoders['severity']`` and
        ``self.models['severity_classifier']``, and the returned dict in
        ``self.training_results['severity_classification']``.
    """
    import re  # local import: only needed for the leakage guard

    self.logger.info("🎯 Training severity classification model...")

    targets = df['severity_level'].tolist()

    # Whole-word, case-insensitive match for any label (longest first so a
    # label that contains another one is matched in full).
    label_words = sorted({str(label) for label in targets}, key=len, reverse=True)
    label_pattern = None
    if label_words:
        label_pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(word) for word in label_words) + r')\b',
            flags=re.IGNORECASE
        )

    def _without_labels(text):
        """Return *text* with every severity label word blanked out."""
        text = str(text)
        return label_pattern.sub(' ', text) if label_pattern else text

    # Prepare text features
    descriptions = [
        f"{vuln_type} vulnerability in {program} {category} system. {_without_labels(description)}"
        for vuln_type, program, category, description in zip(
            df['vulnerability_type'], df['program_name'],
            df['category'], df['description']
        )
    ]

    # Text vectorization
    vectorizer = TfidfVectorizer(
        max_features=2000,
        ngram_range=(1, 3),
        stop_words='english',
        min_df=3,
        max_df=0.95
    )

    X_text = vectorizer.fit_transform(descriptions)
    self.vectorizers['severity'] = vectorizer

    # Label encoding
    encoder = LabelEncoder()
    y = encoder.fit_transform(targets)
    self.encoders['severity'] = encoder

    # Stratified split keeps the class proportions equal in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X_text.toarray(), y, test_size=0.2, random_state=42, stratify=y
    )

    # Model ensemble
    models = {
        'gradient_boosting': GradientBoostingClassifier(
            n_estimators=150,
            max_depth=6,
            learning_rate=0.1,
            min_samples_split=20,
            random_state=42
        ),
        'random_forest': RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=10,
            random_state=42,
            n_jobs=-1
        ),
        'xgboost': xgb.XGBClassifier(
            n_estimators=150,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            tree_method='hist',
            device='cuda' if torch.cuda.is_available() else 'cpu'
        )
    }

    best_model = None
    best_score = -np.inf
    model_results = {}

    for name, model in models.items():
        self.logger.info(f"  Training {name}...")

        # Stratified cross-validation
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy', n_jobs=-1)

        # Train and evaluate
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)

        test_accuracy = accuracy_score(y_test, y_pred_test)

        model_results[name] = {
            'cv_accuracy_mean': cv_scores.mean(),
            'cv_accuracy_std': cv_scores.std(),
            'test_accuracy': test_accuracy
        }

        self.logger.info(f"    CV Accuracy: {cv_scores.mean():.3f}±{cv_scores.std():.3f}")
        self.logger.info(f"    Test Accuracy: {test_accuracy:.3f}")

        # Model selection uses the CV score, never the test score
        if cv_scores.mean() > best_score:
            best_score = cv_scores.mean()
            best_model = model

    self.models['severity_classifier'] = best_model

    # Classification report for the selected model
    y_pred_final = best_model.predict(X_test)
    class_report = classification_report(
        y_test, y_pred_final,
        target_names=encoder.classes_,
        output_dict=True
    )

    results = {
        'best_model': type(best_model).__name__,
        'cv_accuracy_mean': best_score,
        'test_accuracy': accuracy_score(y_test, y_pred_final),
        'classification_report': class_report,
        'class_names': encoder.classes_.tolist(),
        'model_comparison': model_results
    }

    self.training_results['severity_classification'] = results
    self.logger.info(f"✅ Best severity model: {results['best_model']} (Acc={best_score:.3f})")

    return results

def save_models(self):
    """Save all trained models, preprocessors and results to Google Drive.

    Pickles go to ``self.drive_path`` and the results JSON goes to
    ``self.results_path``; both folders are created when missing. All files
    written by one call share the same timestamp suffix.

    Returns:
        The timestamp string (``YYYYMMDD_HHMMSS``) used in the file names.
    """
    self.logger.info("💾 Saving models to Google Drive...")

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Writing into a folder that does not exist yet would raise
    self.drive_path.mkdir(parents=True, exist_ok=True)
    self.results_path.mkdir(parents=True, exist_ok=True)

    # Save models
    for model_name, model in self.models.items():
        model_path = self.drive_path / f"{model_name}_{timestamp}.pkl"
        joblib.dump(model, model_path)
        self.logger.info(f"  Saved {model_name} to {model_path}")

    # Save preprocessors; each group has its own file-name prefix
    preprocessor_groups = (
        ('scaler', self.scalers),
        ('vectorizer', self.vectorizers),
        ('encoder', self.encoders),
    )
    for prefix, group in preprocessor_groups:
        for item_name, item in group.items():
            joblib.dump(item, self.drive_path / f"{prefix}_{item_name}_{timestamp}.pkl")

    def _to_jsonable(value):
        """Convert NumPy scalars/arrays to plain Python; otherwise use str()."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return str(value)

    # Save training results. A plain default=str would write NumPy integers
    # and arrays as strings, so numbers are converted to real JSON numbers.
    results_path = self.results_path / f"training_results_{timestamp}.json"
    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(self.training_results, f, indent=2, default=_to_jsonable)

    self.logger.info(f"✅ All models and results saved with timestamp: {timestamp}")
    return timestamp

# Bind the functions defined above to VulnMLColabTrainer under their own names
for _training_method in (train_bounty_predictor, train_severity_classifier, save_models):
    setattr(VulnMLColabTrainer, _training_method.__name__, _training_method)

print("✅ Training methods added successfully!")

## 📊 Data Generation & Visualization

In [None]:
# Build the trainer, then the dataset it will learn from
trainer = VulnMLColabTrainer()

print("🔧 Generating vulnerability dataset...")
df = trainer.generate_realistic_vuln_data(n_samples=15000)

# Headline numbers
bounties = df['bounty_amount']
print(f"\n📊 Dataset Overview:")
print(f"Total samples: {len(df):,}")
print(f"Bounty range: ${bounties.min():,.0f} - ${bounties.max():,.0f}")
print(f"Average bounty: ${bounties.mean():,.0f}")

# Row counts per severity level and per category
for heading, column in (("Severity distribution", 'severity_level'),
                        ("Category distribution", 'category')):
    print(f"\n{heading}:")
    print(df[column].value_counts())

In [None]:
# 2x2 dashboard describing the generated dataset
panel_titles = (
    'Bounty Distribution by Severity',
    'Vulnerability Types by Category',
    'Bounty Amount Distribution',
    'CVE Score vs Bounty Amount'
)
panel_specs = [
    [{"type": "box"}, {"type": "bar"}],
    [{"type": "histogram"}, {"type": "scatter"}],
]
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

# Top-left: one box per severity level, in order of first appearance
for level in df['severity_level'].unique():
    level_bounties = df.loc[df['severity_level'] == level, 'bounty_amount']
    box_trace = go.Box(y=level_bounties, name=level, showlegend=False)
    fig.add_trace(box_trace, row=1, col=1)

# Top-right: row count per (category, vulnerability type) pair
vuln_counts = (
    df.groupby(['category', 'vulnerability_type'])
    .size()
    .reset_index(name='count')
)
bar_trace = go.Bar(
    x=vuln_counts['vulnerability_type'],
    y=vuln_counts['count'],
    text=vuln_counts['category'],
    showlegend=False
)
fig.add_trace(bar_trace, row=1, col=2)

# Bottom-left: histogram of bounty amounts
histogram_trace = go.Histogram(x=df['bounty_amount'], nbinsx=50, showlegend=False)
fig.add_trace(histogram_trace, row=2, col=1)

# Bottom-right: CVE score against bounty, hover text shows the severity
scatter_trace = go.Scatter(
    x=df['cve_score'],
    y=df['bounty_amount'],
    mode='markers',
    text=df['severity_level'],
    showlegend=False,
    opacity=0.6
)
fig.add_trace(scatter_trace, row=2, col=2)

fig.update_layout(
    height=800,
    title_text="VulnML Dataset Analysis",
    title_x=0.5
)

# Axis settings per panel: (row, col), x-axis title, y-axis settings
axis_settings = (
    ((1, 1), "Severity", dict(title_text="Bounty Amount ($)", type="log")),
    ((1, 2), "Vulnerability Type", dict(title_text="Count")),
    ((2, 1), "Bounty Amount ($)", dict(title_text="Frequency")),
    ((2, 2), "CVE Score", dict(title_text="Bounty Amount ($)", type="log")),
)
for (panel_row, panel_col), x_title, y_settings in axis_settings:
    fig.update_yaxes(row=panel_row, col=panel_col, **y_settings)
    fig.update_xaxes(title_text=x_title, row=panel_row, col=panel_col)

fig.show()

# Show five random rows of the dataset
print("\n📋 Sample Data:")
display(df.sample(5))

## 🔧 Feature Preparation

In [None]:
# Prepare features for training
print("🔧 Preparing features for training...")
X = trainer.prepare_features(df)
y = df['bounty_amount'].values

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Feature range: [{X.min():.2f}, {X.max():.2f}]")
print(f"Target range: [${y.min():,.0f}, ${y.max():,.0f}]")

# Column names, in the same order in which prepare_features builds each row
feature_names = [
    'vuln_type_len', 'program_len', 'description_len', 'cve_score',
    'severity_critical', 'severity_high', 'severity_medium', 'severity_low',
    'vuln_sql', 'vuln_xss', 'vuln_rce', 'vuln_ssrf', 'vuln_idor', 'vuln_csrf',
    'vuln_flash_loan', 'vuln_reentrancy', 'vuln_privilege', 'vuln_buffer',
    'cat_web_app', 'cat_system_sec', 'cat_blockchain', 'cat_infrastructure',
    'program_score', 'complexity_score', 'severity_score', 'risk_score'
]

# Fail loudly if the list above drifts out of sync with the feature matrix;
# otherwise the chart below would silently mislabel the bars.
if len(feature_names) != X.shape[1]:
    raise ValueError(
        f"feature_names has {len(feature_names)} entries "
        f"but X has {X.shape[1]} columns"
    )


def _abs_correlation(column, target):
    """Absolute Pearson correlation; 0 when undefined (constant column)."""
    corr = np.corrcoef(column, target)[0, 1]
    return 0 if np.isnan(corr) else abs(corr)


# Absolute correlation of every feature column with the bounty amount
correlations = [_abs_correlation(X[:, i], y) for i in range(X.shape[1])]

# Feature importance plot (correlation is only a linear, per-feature proxy)
fig = go.Figure(data=[
    go.Bar(
        x=feature_names,
        y=correlations,
        text=[f'{c:.3f}' for c in correlations],
        textposition='auto'
    )
])

fig.update_layout(
    title='Feature Importance (Correlation with Bounty Amount)',
    xaxis_title='Features',
    yaxis_title='Absolute Correlation',
    xaxis_tickangle=-45,
    height=600
)

fig.show()

print(f"\n🔝 Top 5 most important features:")
top_features = sorted(zip(feature_names, correlations), key=lambda x: x[1], reverse=True)[:5]
for feature, corr in top_features:
    print(f"  {feature}: {corr:.3f}")

## 🏋️ Model Training

In [None]:
# Fit the bounty regressors and report the selected one
print("🤖 Training Bounty Prediction Models...")
print("=" * 50)

bounty_results = trainer.train_bounty_predictor(X, y)

bounty_summary = (
    f"\n✅ Bounty Prediction Training Complete!",
    f"Best Model: {bounty_results['best_model']}",
    f"Cross-Validation R²: {bounty_results['cv_r2_mean']:.3f}",
    f"Training Samples: {bounty_results['samples_count']:,}",
    f"Features: {bounty_results['features_count']}",
)
for summary_line in bounty_summary:
    print(summary_line)

In [None]:
# Fit the severity classifiers and report the selected one
print("\n🎯 Training Severity Classification Models...")
print("=" * 50)

severity_results = trainer.train_severity_classifier(df)

class_list = ', '.join(severity_results['class_names'])
severity_summary = (
    f"\n✅ Severity Classification Training Complete!",
    f"Best Model: {severity_results['best_model']}",
    f"Cross-Validation Accuracy: {severity_results['cv_accuracy_mean']:.3f}",
    f"Test Accuracy: {severity_results['test_accuracy']:.3f}",
    f"Classes: {class_list}",
)
for summary_line in severity_summary:
    print(summary_line)

## 📈 Model Evaluation & Visualization

In [None]:
# Create comprehensive evaluation visualizations
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Model Performance Comparison (Bounty)',
        'Model Performance Comparison (Severity)',
        'Prediction vs Actual (Sample)',
        'Training Progress Summary'
    )
)

# Bounty model comparison (cross-validated R² per model)
bounty_models = list(bounty_results['model_comparison'].keys())
bounty_scores = [bounty_results['model_comparison'][m]['cv_r2_mean'] for m in bounty_models]

fig.add_trace(
    go.Bar(
        x=bounty_models,
        y=bounty_scores,
        name='Bounty Models',
        text=[f'{s:.3f}' for s in bounty_scores],
        textposition='auto',
        showlegend=False
    ),
    row=1, col=1
)

# Severity model comparison (cross-validated accuracy per model)
severity_models = list(severity_results['model_comparison'].keys())
severity_scores = [severity_results['model_comparison'][m]['cv_accuracy_mean'] for m in severity_models]

fig.add_trace(
    go.Bar(
        x=severity_models,
        y=severity_scores,
        name='Severity Models',
        text=[f'{s:.3f}' for s in severity_scores],
        textposition='auto',
        showlegend=False
    ),
    row=1, col=2
)

# Sample predictions vs actual.
# NOTE: the rows are drawn from the full dataset, which includes rows the
# model was trained on, so this panel looks better than held-out performance.
sample_size = min(100, len(X))  # sampling without replacement needs size <= len(X)
sample_indices = np.random.choice(len(X), size=sample_size, replace=False)
X_sample = trainer.scalers['bounty'].transform(X[sample_indices])
y_sample = y[sample_indices]
y_pred_sample = trainer.models['bounty_predictor'].predict(X_sample)

fig.add_trace(
    go.Scatter(
        x=y_sample,
        y=y_pred_sample,
        mode='markers',
        name='Predictions',
        showlegend=False,
        opacity=0.7
    ),
    row=2, col=1
)

# Perfect prediction line. Both axes are logarithmic, so only positive values
# can anchor the line (a linear model may predict zero or negative bounties).
plotted_values = np.concatenate([np.asarray(y_sample, dtype=float),
                                 np.asarray(y_pred_sample, dtype=float)])
positive_values = plotted_values[plotted_values > 0]
if positive_values.size:
    min_val, max_val = positive_values.min(), positive_values.max()
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode='lines',
            name='Perfect Prediction',
            line=dict(dash='dash', color='red'),
            showlegend=False
        ),
        row=2, col=1
    )

# Training summary metrics. Bar heights are scaled to fit one axis; the labels
# show the real, unscaled numbers.
summary_names = ['Bounty R²', 'Severity Acc', 'Features', 'Samples']
summary_heights = [
    bounty_results['cv_r2_mean'],
    severity_results['cv_accuracy_mean'],
    bounty_results['features_count'] / 100,  # Scale for visualization
    bounty_results['samples_count'] / 10000   # Scale for visualization
]
summary_labels = [
    f"{bounty_results['cv_r2_mean']:.3f}",
    f"{severity_results['cv_accuracy_mean']:.3f}",
    f"{bounty_results['features_count']:,}",
    f"{bounty_results['samples_count']:,}",
]

fig.add_trace(
    go.Bar(
        x=summary_names,
        y=summary_heights,
        text=summary_labels,
        textposition='auto',
        showlegend=False
    ),
    row=2, col=2
)

fig.update_layout(
    height=800,
    title_text="VulnML Model Training Results",
    title_x=0.5
)

fig.update_yaxes(title_text="R² Score", row=1, col=1)
fig.update_yaxes(title_text="Accuracy", row=1, col=2)
fig.update_yaxes(title_text="Predicted Bounty ($)", type="log", row=2, col=1)
fig.update_xaxes(title_text="Actual Bounty ($)", type="log", row=2, col=1)
fig.update_yaxes(title_text="Normalized Values", row=2, col=2)

fig.show()

# Print detailed classification report
print("\n📊 Detailed Severity Classification Report:")
print("=" * 60)
for class_name in severity_results['class_names']:
    if class_name in severity_results['classification_report']:
        class_metrics = severity_results['classification_report'][class_name]
        print(f"{class_name:>10}: Precision={class_metrics['precision']:.3f}, Recall={class_metrics['recall']:.3f}, F1={class_metrics['f1-score']:.3f}")

## 💾 Save Models & Results

In [None]:
# Save all models and results to Google Drive
print("💾 Saving models and results to Google Drive...")
timestamp = trainer.save_models()

print(f"\n✅ All models saved successfully!")
print(f"📁 Models location: /content/drive/MyDrive/VulnML_Models/")
print(f"📁 Results location: /content/drive/MyDrive/VulnML_Results/")
print(f"🏷️ Timestamp: {timestamp}")

# Create deployment instructions.
# The template is a plain string with @@...@@ placeholders (not an f-string),
# so the generated code can use braces freely. It builds the complete
# 26-value feature vector in the same order as prepare_features; a shorter
# vector would make scaler.transform raise in the generated script.
deployment_template = '''
# VulnML Model Deployment Instructions
# Generated on: @@GENERATED_AT@@
# Timestamp: @@TIMESTAMP@@

import joblib
import numpy as np

# Load models
bounty_model = joblib.load('bounty_predictor_@@TIMESTAMP@@.pkl')
severity_model = joblib.load('severity_classifier_@@TIMESTAMP@@.pkl')
scaler = joblib.load('scaler_bounty_@@TIMESTAMP@@.pkl')
vectorizer = joblib.load('vectorizer_severity_@@TIMESTAMP@@.pkl')
encoder = joblib.load('encoder_severity_@@TIMESTAMP@@.pkl')

TIER1_PROGRAMS = ['Google', 'Microsoft', 'Apple', 'Meta', 'Amazon']
TIER2_PROGRAMS = ['Netflix', 'Uber', 'PayPal', 'GitHub', 'Slack']
DEFI_PROGRAMS = ['Uniswap', 'Compound', 'Aave', 'MakerDAO']

RISK_GROUPS = [
    (1.0, ['REMOTE CODE EXECUTION', 'FLASH LOAN ATTACK', 'REENTRANCY']),
    (0.8, ['SQL INJECTION', 'SSRF', 'PRIVILEGE ESCALATION', 'BUFFER OVERFLOW']),
    (0.6, ['XSS', 'IDOR', 'CSRF', 'AUTHENTICATION BYPASS']),
]

COMPLEXITY_SCORES = {'Low': 0.3, 'Medium': 0.6, 'High': 1.0}
SEVERITY_SCORES = {'Low': 0.25, 'Medium': 0.5, 'High': 0.75, 'Critical': 1.0}


def program_score(program):
    if program in TIER1_PROGRAMS:
        return 1.0
    if program in DEFI_PROGRAMS:
        return 0.9
    if program in TIER2_PROGRAMS:
        return 0.7
    return 0.5


def risk_score(vuln_type):
    upper = vuln_type.upper()
    for score, keywords in RISK_GROUPS:
        if any(keyword in upper for keyword in keywords):
            return score
    return 0.4


def build_features(vuln_type, program, category, severity, cve_score, complexity):
    # Same 26 values, in the same order, as used during training
    upper = vuln_type.upper()
    description = f"{severity} {vuln_type} in {program} {category} system"
    return [
        len(vuln_type), len(program), len(description),
        float(cve_score),
        1 if severity == 'Critical' else 0,
        1 if severity == 'High' else 0,
        1 if severity == 'Medium' else 0,
        1 if severity == 'Low' else 0,
        1 if 'SQL' in upper else 0,
        1 if 'XSS' in upper else 0,
        1 if 'RCE' in upper or 'REMOTE CODE' in upper else 0,
        1 if 'SSRF' in upper else 0,
        1 if 'IDOR' in upper else 0,
        1 if 'CSRF' in upper else 0,
        1 if 'FLASH LOAN' in upper else 0,
        1 if 'REENTRANCY' in upper else 0,
        1 if 'PRIVILEGE' in upper else 0,
        1 if 'BUFFER' in upper else 0,
        1 if category == 'web_application' else 0,
        1 if category == 'system_security' else 0,
        1 if category == 'blockchain_defi' else 0,
        1 if category == 'infrastructure' else 0,
        program_score(program),
        COMPLEXITY_SCORES.get(complexity, 0.6),
        SEVERITY_SCORES.get(severity, 0.5),
        risk_score(vuln_type),
    ]


# Example prediction function
def predict_vulnerability(vuln_type, program, category, cve_score=5.0, complexity='Medium'):
    # Predict severity from the text description
    description = f"{vuln_type} vulnerability in {program} {category} system"
    text_features = vectorizer.transform([description])
    severity_encoded = severity_model.predict(text_features.toarray())[0]
    predicted_severity = str(encoder.inverse_transform([severity_encoded])[0])

    # Predict bounty from the full feature vector, using the predicted severity
    feature_vector = build_features(
        vuln_type, program, category, predicted_severity, cve_score, complexity
    )
    features_scaled = scaler.transform([feature_vector])
    predicted_bounty = float(bounty_model.predict(features_scaled)[0])

    return {
        'predicted_bounty': predicted_bounty,
        'predicted_severity': predicted_severity
    }

# Example usage
result = predict_vulnerability(
    vuln_type="SQL Injection",
    program="Google",
    category="web_application",
    cve_score=8.5
)
print(f"Predicted bounty: ${result['predicted_bounty']:,.0f}")
print(f"Predicted severity: {result['predicted_severity']}")
'''

deployment_code = (
    deployment_template
    .replace('@@GENERATED_AT@@', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    .replace('@@TIMESTAMP@@', timestamp)
)

# Save deployment instructions
deploy_path = trainer.results_path / f"deployment_instructions_{timestamp}.py"
with open(deploy_path, 'w', encoding='utf-8') as f:
    f.write(deployment_code)

print(f"\n📋 Deployment instructions saved: {deploy_path}")

# Final recap: winning model and cross-validated score for each task, the
# dataset dimensions, and the timestamp under which artifacts were saved.
bounty_model_name = bounty_results['best_model']
bounty_r2 = bounty_results['cv_r2_mean']
severity_model_name = severity_results['best_model']
severity_accuracy = severity_results['cv_accuracy_mean']
sample_total = bounty_results['samples_count']
feature_total = bounty_results['features_count']

print("\n🎉 VulnML Training Complete!")
print("=" * 50)
print(f"🤖 Bounty Predictor: {bounty_model_name} (R²={bounty_r2:.3f})")
print(f"🎯 Severity Classifier: {severity_model_name} (Acc={severity_accuracy:.3f})")
print(f"📊 Training Samples: {sample_total:,}")
print(f"🔧 Features: {feature_total}")
print(f"💾 Models saved to Google Drive with timestamp: {timestamp}")
print("\n🚀 Ready for deployment!")

## 🧪 Quick Model Test

In [None]:
# Smoke-test scenarios for the trained models. Each row is one hand-written
# vulnerability together with the bounty range its prediction is expected
# to fall in.
print("🧪 Testing trained models with example vulnerabilities...")
print("=" * 60)

case_fields = (
    'name', 'vuln_type', 'program', 'category',
    'cve_score', 'expected_bounty_range',
)
case_rows = (
    ('Critical RCE in Google', 'Remote Code Execution', 'Google',
     'system_security', 9.8, (50000, 150000)),
    ('SQL Injection in PayPal', 'SQL Injection', 'PayPal',
     'web_application', 7.5, (5000, 25000)),
    ('Flash Loan Attack in Uniswap', 'Flash Loan Attack', 'Uniswap',
     'blockchain_defi', 9.5, (200000, 800000)),
    ('XSS in Medium Website', 'Cross-site Scripting (XSS)', 'Reddit',
     'web_application', 4.3, (100, 3000)),
)

# One dict per scenario, keyed by the field names above.
test_cases = [dict(zip(case_fields, row)) for row in case_rows]

def _severity_one_hot(case_name, cve_score):
    """Return the ``[Critical, High, Medium, Low]`` indicator flags for a case.

    A severity word appearing in the case name takes precedence. When the
    name carries none, the flag is derived from the CVSS v3.x qualitative
    band of ``cve_score`` so the severity group is not left all-zero.
    A score of 0 (CVSS "None") leaves every flag at 0.
    """
    labels = ('Critical', 'High', 'Medium', 'Low')
    flags = [1 if label in case_name else 0 for label in labels]
    if any(flags) or cve_score <= 0:
        return flags

    if cve_score >= 9.0:
        band = 'Critical'
    elif cve_score >= 7.0:
        band = 'High'
    elif cve_score >= 4.0:
        band = 'Medium'
    else:
        band = 'Low'
    return [1 if label == band else 0 for label in labels]


# Number of test cases whose predicted bounty landed inside the expected range.
successful_predictions = 0

for test_case in test_cases:
    try:
        vuln_upper = test_case['vuln_type'].upper()
        category = test_case['category']

        # Hand-built feature vector for this case.
        # NOTE(review): order and length presumably have to match what the
        # bounty scaler/model were fitted on -- confirm against the trainer.
        feature_vector = [
            len(test_case['vuln_type']),
            len(test_case['program']),
            100,  # description length
            test_case['cve_score'],
            *_severity_one_hot(test_case['name'], test_case['cve_score']),
            1 if 'SQL' in vuln_upper else 0,
            1 if 'XSS' in vuln_upper else 0,
            1 if 'RCE' in vuln_upper or 'REMOTE CODE' in vuln_upper else 0,
            0, 0, 0,  # SSRF, IDOR, CSRF
            1 if 'FLASH LOAN' in vuln_upper else 0,
            0, 0, 0,  # reentrancy, privilege, buffer
            1 if category == 'web_application' else 0,
            1 if category == 'system_security' else 0,
            1 if category == 'blockchain_defi' else 0,
            1 if category == 'infrastructure' else 0,
            trainer._get_program_score(test_case['program']),
            0.6,  # complexity score
            0.75,  # severity score
            trainer._get_risk_score(test_case['vuln_type'])
        ]

        # Predict bounty
        features_scaled = trainer.scalers['bounty'].transform([feature_vector])
        predicted_bounty = trainer.models['bounty_predictor'].predict(features_scaled)[0]

        # Predict severity from a synthetic one-line description
        description = f"{test_case['vuln_type']} vulnerability in {test_case['program']} {test_case['category']} system"
        text_features = trainer.vectorizers['severity'].transform([description])
        severity_encoded = trainer.models['severity_classifier'].predict(text_features.toarray())[0]
        predicted_severity = trainer.encoders['severity'].inverse_transform([severity_encoded])[0]

        # Check if prediction is within expected range
        expected_min, expected_max = test_case['expected_bounty_range']
        within_range = expected_min <= predicted_bounty <= expected_max

        if within_range:
            successful_predictions += 1
            status = "✅"
        else:
            status = "❌"

        print(f"{status} {test_case['name']}:")
        print(f"   Predicted Bounty: ${predicted_bounty:,.0f}")
        print(f"   Expected Range: ${expected_min:,} - ${expected_max:,}")
        print(f"   Predicted Severity: {predicted_severity}")
        print()

    except Exception as e:
        # Deliberately broad: one failing case is reported and counted as a
        # miss so the remaining cases still run.
        print(f"❌ {test_case['name']}: Prediction failed - {e}")
        print()

# Share of test cases whose predicted bounty fell inside the expected range.
case_total = len(test_cases)
accuracy = successful_predictions / case_total
print(f"🎯 Test Accuracy: {accuracy:.1%} ({successful_predictions}/{case_total})")

# Verdict thresholds, highest first; the first floor that is met wins.
verdict_by_floor = (
    (0.75, "🟢 Models are performing well and ready for production!"),
    (0.5, "🟡 Models show promise but may need fine-tuning"),
)
default_verdict = "🔴 Models need significant improvement before deployment"
print(next(
    (message for floor, message in verdict_by_floor if accuracy >= floor),
    default_verdict,
))

print("\n🎉 VulnML Colab Training Session Complete!")