# 🛡️ Mosaic Protocol - Smart Contract Vulnerability Classifier V2

**Dataset:** SmartBugs Curated (248 samples)
**Features:** 55 hybrid Slither-ML features
**Model:** XGBoost with SMOTE balancing

---

## 1️⃣ Setup & Mount Google Drive

In [None]:
# Colab-only setup: mount Google Drive at /content/drive and make sure the
# folder that later cells write their artifacts to exists.
import os

from google.colab import drive

drive.mount('/content/drive')

# Every file this notebook saves (model, metadata, plots) goes under this path.
OUTPUT_DIR = '/content/drive/MyDrive/mosaic-ml'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder is already there

print(f"‚úÖ Output directory: {OUTPUT_DIR}")

In [None]:
# Install dependencies.
# The leading "!" is the notebook shell escape: pip runs as a shell command,
# not as Python code, and --quiet suppresses most of its output.
# NOTE(review): onnxmltools, onnx and onnxruntime are installed here but are not
# used by any cell visible in this notebook; confirm they are still needed.
!pip install xgboost scikit-learn imbalanced-learn onnxmltools onnx onnxruntime matplotlib seaborn --quiet
# NOTE(review): this message is printed unconditionally; nothing here checks
# whether the pip command above actually succeeded.
print("‚úÖ Dependencies installed")

## 2️⃣ Load SmartBugs Training Data

In [None]:
import json
import gzip
import numpy as np
from pathlib import Path

# Folder on the mounted Drive that is searched for the exported training set.
DATA_DIR = '/content/drive/MyDrive/mosaic-ml'

# Find smartbugs training file (plain .json or compressed .json.gz both match).
data_files = list(Path(DATA_DIR).glob('smartbugs_training_*.json*'))
if not data_files:
    # Fallback to old format
    data_files = list(Path(DATA_DIR).glob('training_data_*.json*'))

if not data_files:
    raise FileNotFoundError(f"No training data in {DATA_DIR}")

# When several files match, take the one whose path sorts last.
# NOTE(review): that is the newest file only if the filenames embed a
# sortable timestamp/version — confirm against the exporter.
data_file = max(data_files)
print(f"üìÇ Loading: {data_file.name}")

# Read both the gzip and the plain variant through one text-mode code path with
# an explicit UTF-8 encoding. Previously the plain-JSON branch relied on the
# platform default encoding, which is locale-dependent and could disagree with
# the UTF-8 used for the gzip branch.
opener = gzip.open if data_file.suffix == '.gz' else open
with opener(data_file, 'rt', encoding='utf-8') as f:
    data = json.load(f)

# The file is expected to carry a 'metadata' object with the keys read below;
# a missing key raises KeyError here rather than failing later in training.
metadata = data['metadata']

print(f"""\nüìä Dataset Summary:
   Total samples: {metadata['totalSamples']:,}
   Features: {metadata['featureCount']}
   Labels: {metadata['labelDistribution']}
""")

# Optional section: show the five most frequent vulnerability types.
if 'vulnerabilityTypes' in metadata:
    print("üè∑Ô∏è Vulnerability types:")
    for vtype, count in sorted(metadata['vulnerabilityTypes'].items(), key=lambda x: -x[1])[:5]:
        print(f"   {vtype}: {count}")

In [None]:
def to_arrays(samples):
    """Convert a list of sample records into a (features, labels) array pair.

    Each record must provide a 'features' entry and a 'label' entry.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds the 'features' values as float32
        and ``y`` holds the 'label' values as int32, in input order.

    Raises:
        KeyError: if a record lacks 'features' or 'label'.
    """
    features = np.asarray([record['features'] for record in samples], dtype=np.float32)
    labels = np.asarray([record['label'] for record in samples], dtype=np.int32)
    return features, labels

# Turn each pre-defined split of the loaded file into (features, labels) arrays.
X_train, y_train = to_arrays(data['train'])
X_val, y_val = to_arrays(data['validation'])
X_test, y_test = to_arrays(data['test'])


def _describe_split(X, y):
    """Format the shape of X and the number of 0 / 1 labels in y for display."""
    return f"{X.shape} (safe: {(y==0).sum()}, vuln: {(y==1).sum()})"


# One summary line per split; label 0 is reported as "safe", label 1 as "vuln".
print(
    "üìê Array shapes:\n"
    f"   Train: {_describe_split(X_train, y_train)}\n"
    f"   Val:   {_describe_split(X_val, y_val)}\n"
    f"   Test:  {_describe_split(X_test, y_test)}\n"
)

# Column names for the feature matrix, reused later for plots and metadata.
feature_names = data['metadata']['featureNames']
print(f"Feature names ({len(feature_names)}): {feature_names[:5]}...")

## 3️⃣ Handle Class Imbalance with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only: X_val / X_test are never passed to SMOTE.
safe_before = np.count_nonzero(y_train == 0)
vuln_before = np.count_nonzero(y_train == 1)
print("‚öñÔ∏è Before SMOTE:")
print(f"   Safe: {safe_before}, Vulnerable: {vuln_before}")

# Fixed seed so the synthetic samples are reproducible between runs.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled

safe_after = np.count_nonzero(y_train_balanced == 0)
vuln_after = np.count_nonzero(y_train_balanced == 1)
print("\n‚úÖ After SMOTE:")
print(f"   Safe: {safe_after}, Vulnerable: {vuln_after}")

## 4️⃣ Train XGBoost Model

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Train model on balanced data.
# `use_label_encoder` is deliberately not passed: the option is deprecated and
# has been removed from current XGBoost releases, where supplying it only
# triggers an "unused parameter" warning. The labels here are already 0/1.
model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    eval_metric='aucpr',  # metric reported on eval_set during fitting
    random_state=42,      # fixed seed for reproducible training
    verbosity=1,
)

print("üöÄ Training XGBoost on SMOTE-balanced data...")
# Fit on the oversampled training data; the untouched validation split is only
# monitored (no early stopping is configured). verbose=20 controls how often
# the evaluation metric is printed.
model.fit(
    X_train_balanced, y_train_balanced,
    eval_set=[(X_val, y_val)],
    verbose=20,
)
print("‚úÖ Training complete!")

## 5️⃣ Evaluate Model

In [None]:
# Score the test split: column 1 of predict_proba is kept as the score for
# label 1 ("Vulnerable"), and predict() supplies the hard labels.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

class_names = ['Safe', 'Vulnerable']

print("üìä Classification Report (default threshold=0.5):")
default_report = classification_report(y_test, y_pred, target_names=class_names)
print(default_report)

# ROC-AUC is computed from the scores, so it does not depend on a threshold.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"üéØ ROC-AUC Score: {roc_auc:.4f}")

In [None]:
# Confusion matrix of the default-threshold predictions on the test split,
# drawn as an annotated heatmap, saved to OUTPUT_DIR and shown inline.
fig, ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
tick_labels = ['Safe', 'Vulnerable']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts in each cell
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/confusion_matrix_v2.png")
plt.show()

In [None]:
# Optimize threshold for high recall: choose a decision threshold whose recall
# on the "Vulnerable" class is still at least `target_recall`.
# NOTE(review): the threshold is selected on (y_test, y_pred_proba) — the same
# test data the report below is computed on — so that report is likely
# optimistic. Consider selecting the threshold on the validation split instead.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

target_recall = 0.85
idx = np.flatnonzero(recall >= target_recall)
if idx.size == 0:
    # No point on the curve reaches the target: fall back to a fixed threshold.
    optimal_threshold = 0.3
    best_idx = 0
    print(f"‚ö†Ô∏è Using default threshold: {optimal_threshold}")
else:
    # Take the last qualifying curve point (presumably the highest threshold
    # that still meets the target, given how the curve is ordered — see the
    # precision_recall_curve docs).
    best_idx = idx[-1]
    # precision/recall have one more entry than thresholds, so guard the lookup.
    if best_idx < len(thresholds):
        optimal_threshold = thresholds[best_idx]
    else:
        optimal_threshold = 0.5
    print(f"üéØ For {target_recall:.0%} recall on vulnerabilities:")
    print(f"   Optimal threshold: {optimal_threshold:.3f}")
    print(f"   Precision: {precision[best_idx]:.3f}")

# Re-label the test scores with the chosen threshold and report again.
y_pred_opt = (y_pred_proba >= optimal_threshold).astype(int)
print("\nüìä Classification Report (optimized threshold):")
optimized_report = classification_report(y_test, y_pred_opt, target_names=['Safe', 'Vulnerable'])
print(optimized_report)

## 6️⃣ Feature Importance

In [None]:
# Rank features by the fitted model's importance scores, plot the top ones and
# print the leading entries.
importances = model.feature_importances_

# Clamp the plot size to the number of available features. With the previous
# hard-coded 15, plt.bar(range(15), ...) fails with a shape mismatch as soon as
# the model has fewer than 15 features.
top_n = min(15, len(importances))
indices = np.argsort(importances)[::-1][:top_n]  # descending importance

plt.figure(figsize=(12, 6))
plt.bar(range(top_n), importances[indices])
plt.xticks(range(top_n), [feature_names[i] for i in indices], rotation=45, ha='right')
plt.title(f'Top {top_n} Most Important Features')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/feature_importance_v2.png")
plt.show()

# Text listing of at most the ten highest-ranked features.
print(f"\nüîù Top {min(10, top_n)} Features:")
for rank, idx_feat in enumerate(indices[:10], start=1):
    print(f"   {rank}. {feature_names[idx_feat]}: {importances[idx_feat]:.4f}")

## 7️⃣ Save Model (Pickle - Simple & Works)

In [None]:
import joblib

# Persist the fitted model with joblib under OUTPUT_DIR.
model_path = f"{OUTPUT_DIR}/vulnerability_classifier_v2.pkl"
joblib.dump(model, model_path)
print(f"‚úÖ Saved model: {model_path}")

# Collect the evaluation numbers first. Recall/precision are read from the
# precision-recall curve at best_idx and fall back to 0 when no curve point
# met the target recall (idx empty).
threshold_found = len(idx) > 0
metric_summary = {
    'roc_auc': float(roc_auc),
    'accuracy': float((y_pred == y_test).mean()),  # default-threshold predictions
    'recall_vulnerable': float(recall[best_idx]) if threshold_found else 0,
    'precision_vulnerable': float(precision[best_idx]) if threshold_found else 0,
}

# Save metadata: everything is converted to plain Python types so that
# json.dump can serialise it.
model_metadata = {
    'model_name': 'vulnerability_classifier_v2',
    'model_type': 'XGBoost',
    'created_at': str(np.datetime64('now')),
    'num_features': len(feature_names),
    'feature_names': feature_names,
    'classes': ['safe', 'vulnerable'],
    'optimal_threshold': float(optimal_threshold),
    'metrics': metric_summary,
    'training_samples': int(len(y_train_balanced)),  # count after SMOTE
}

metadata_path = f"{OUTPUT_DIR}/model_metadata_v2.json"
with open(metadata_path, 'w') as f:
    json.dump(model_metadata, f, indent=2)
print(f"‚úÖ Saved metadata: {metadata_path}")

# Echo the metadata, leaving out the long feature-name list.
print("\nüìã Model Metadata:")
for k, v in model_metadata.items():
    if k == 'feature_names':
        continue
    print(f"   {k}: {v}")

## ✅ Training Complete!

**Files saved:**
- `vulnerability_classifier_v2.pkl` - XGBoost model
- `model_metadata_v2.json` - Configuration
- `confusion_matrix_v2.png`
- `feature_importance_v2.png`

**Next:** Download files and place in `backend/src/agents/defi-safety/ml/models/`

In [None]:
# List the artifacts in OUTPUT_DIR whose names contain 'v2' or 'smartbugs',
# together with their size in kilobytes.
print("\nüìÅ Output files:")
for entry in os.listdir(OUTPUT_DIR):
    if not ('v2' in entry or 'smartbugs' in entry):
        continue
    size_kb = os.path.getsize(os.path.join(OUTPUT_DIR, entry)) / 1024
    print(f"   {entry} ({size_kb:.1f} KB)")