# Risk Predictor - Model Evaluation and Monitoring

This notebook focuses on comprehensive model evaluation, performance monitoring, and production readiness assessment.

## Objectives:
- Load and evaluate trained models
- Performance analysis and validation
- Model interpretability and explainability
- Production monitoring setup
- API integration testing

## 1. Import Libraries and Load Model

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib

# Model evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.model_selection import learning_curve

# Model interpretation (optional dependency).
# SHAP_AVAILABLE records whether `shap` could be imported, so later cells can
# branch on the flag instead of failing on a missing package.
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print('SHAP not available. Install with: pip install shap')

# API testing
import requests
import time

# Utilities
from datetime import datetime
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides warnings emitted by the metric and statistics
# calls in later cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

print('Libraries imported successfully!')

In [None]:
# Load the trained model and components.
# Defines `model`, `label_encoder` and `metadata`, which every later cell uses.
model_path = '/home/jovyan/data/models/'

try:
    # NOTE(security): joblib.load unpickles arbitrary Python objects. Only load
    # artifacts from a trusted location (these are expected to be written by
    # the model development notebook).
    model = joblib.load(f'{model_path}risk_predictor_model.pkl')
    label_encoder = joblib.load(f'{model_path}label_encoder.pkl')

    # Metadata stored alongside the model artifacts
    with open(f'{model_path}model_metadata.json', 'r') as f:
        metadata = json.load(f)
except FileNotFoundError as e:
    print(f'Model files not found: {e}')
    print('Please run the model development notebook first to train and save the model.')
    # Stop here: continuing would only surface as a confusing NameError in the
    # next cell that touches model / label_encoder / metadata.
    raise
else:
    print('Model loaded successfully!')
    print(f'Model type: {metadata["model_type"]}')
    print(f'Training accuracy: {metadata["accuracy"]:.4f}')
    print(f'Feature count: {len(metadata["feature_names"])}')
    print(f'Target classes: {metadata["target_classes"]}')

## 2. Generate Synthetic Test Data for Evaluation

In [None]:
# Build a synthetic test dataset (this cell generates data; nothing is read
# from disk). All draws come from NumPy's global RNG, so the values depend on
# the seed and on the order of the np.random calls below.
np.random.seed(123)  # Different seed for test data
n_test = 1000

# One row per synthetic applicant; credit_score and employment_length are
# clipped to [300, 850] and [0, 40] respectively.
test_df = pd.DataFrame({
    'age': np.random.randint(18, 80, n_test),
    'income': np.random.lognormal(10.5, 0.8, n_test),
    'credit_score': np.random.normal(650, 100, n_test).clip(300, 850),
    'employment_length': np.random.exponential(5, n_test).clip(0, 40),
    'loan_amount': np.random.lognormal(10, 0.7, n_test),
    'debt_to_income': np.random.beta(2, 5, n_test) * 100,
    'previous_defaults': np.random.poisson(0.3, n_test),
    'education_level': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 
                                      n_test, p=[0.4, 0.35, 0.2, 0.05])
})

# Ground-truth score: a linear combination of the numeric features plus
# Gaussian noise (sd 0.5). Age, income, credit score and employment length
# lower the score; loan amount, debt-to-income and previous defaults raise it.
# education_level does not enter the score.
risk_score = (
    -0.01 * test_df['age'] +
    -0.00001 * test_df['income'] +
    -0.005 * test_df['credit_score'] +
    -0.02 * test_df['employment_length'] +
    0.00002 * test_df['loan_amount'] +
    0.02 * test_df['debt_to_income'] +
    0.5 * test_df['previous_defaults'] +
    np.random.normal(0, 0.5, n_test)
)

# Bucket the score: <= -0.5 -> 'Low', (-0.5, 0.5] -> 'Medium', > 0.5 -> 'High'.
# NOTE(review): the credit_score term alone contributes about -3.25 at the
# centre of its distribution (650), so most scores probably fall below -0.5
# and land in 'Low'. Check the printed distribution before trusting per-class
# metrics, and confirm the training notebook used the same formula.
test_df['true_risk_level'] = pd.cut(risk_score, 
                                   bins=[-np.inf, -0.5, 0.5, np.inf], 
                                   labels=['Low', 'Medium', 'High'])

print(f'Test dataset created: {test_df.shape}')
print(f'True risk distribution:\n{test_df["true_risk_level"].value_counts()}')

## 3. Model Predictions and Performance

In [None]:
# Make predictions on test data.
# Defines X_test, y_true, y_pred, y_pred_proba, y_pred_labels and
# y_true_labels for the evaluation cells that follow.
try:
    X_test = test_df.drop('true_risk_level', axis=1)
    y_true = label_encoder.transform(test_df['true_risk_level'])

    # Hard predictions and per-class probabilities
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Convert back to labels for display
    y_pred_labels = label_encoder.inverse_transform(y_pred)
    y_true_labels = test_df['true_risk_level']
except Exception as e:
    print(f'Error making predictions: {e}')
    # Re-raise: every later cell depends on the variables above, so carrying on
    # would only fail further down with an unrelated-looking NameError.
    raise
else:
    print('Predictions completed successfully!')
    print(f'Predicted risk distribution:\n{pd.Series(y_pred_labels).value_counts()}')

In [None]:
# Performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Encoded ids of every class the encoder knows. Passing them explicitly keeps
# the report aligned with target_names even when a class is missing from both
# y_true and y_pred (otherwise classification_report raises on the mismatch).
class_ids = np.arange(len(label_encoder.classes_))

# zero_division=0 makes the "class never predicted" case explicit instead of
# relying on the notebook-wide warning filter to hide it.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print('=== MODEL PERFORMANCE ON TEST DATA ===')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

# Detailed classification report
print('\nDetailed Classification Report:')
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

In [None]:
# Confusion matrix visualization
class_names = label_encoder.classes_

# Fix the label set so the matrix always has one row/column per known class,
# even if a class never occurs in y_true or y_pred. Without this the matrix can
# be smaller than the tick labels and the per-class loop below.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - Test Data')
plt.xlabel('Predicted Risk Level')
plt.ylabel('True Risk Level')
plt.show()

# Per-class accuracy: correctly predicted rows divided by the number of true
# rows of that class. Classes with no true samples are reported as NaN
# explicitly rather than through a suppressed divide-by-zero warning.
class_support = cm.sum(axis=1)
class_accuracies = np.divide(
    cm.diagonal(),
    class_support,
    out=np.full(len(class_names), np.nan),
    where=class_support > 0,
)

print('\nPer-class Accuracy:')
for class_name, class_accuracy in zip(class_names, class_accuracies):
    print(f'{class_name}: {class_accuracy:.4f}')

## 4. ROC Curves and AUC Analysis

In [None]:
# ROC curves for each class (one-vs-rest)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from itertools import cycle

# One indicator column per class
n_classes = len(label_encoder.classes_)
y_test_bin = label_binarize(y_true, classes=list(range(n_classes)))
if y_test_bin.shape[1] == 1:
    # label_binarize returns a single column (the indicator of the second
    # class) for two-class problems; expand it so column i always refers to
    # class i and the per-class loop below works unchanged.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

# ROC curve and area under it for each class, keyed by encoded class id.
# Assumes column i of y_pred_proba belongs to encoded class i.
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves
plt.figure(figsize=(10, 8))
colors = cycle(['blue', 'red', 'green', 'orange', 'purple'])

for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve of class {label_encoder.classes_[i]} (AUC = {roc_auc[i]:.2f})')

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc='lower right')
plt.grid(alpha=0.3)
plt.show()

# Print AUC scores
print('AUC Scores by Class:')
for i, class_name in enumerate(label_encoder.classes_):
    print(f'{class_name}: {roc_auc[i]:.4f}')

## 5. Model Interpretability (SHAP Analysis)

In [None]:
# SHAP analysis for model interpretability, with a feature-importance fallback
# that runs whenever SHAP is missing OR the SHAP analysis itself fails.
shap_succeeded = False

if SHAP_AVAILABLE:
    try:
        # Create SHAP explainer
        explainer = shap.Explainer(model.predict, X_test.iloc[:100])  # Use subset for speed
        shap_values = explainer(X_test.iloc[:100])

        # Summary plot
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_test.iloc[:100], show=False)
        plt.title('SHAP Summary Plot - Feature Importance')
        plt.tight_layout()
        plt.show()

        print('SHAP analysis completed successfully!')
        shap_succeeded = True

    except Exception as e:
        print(f'SHAP analysis failed: {e}')
        print('This might be due to model type compatibility or data format issues.')
else:
    print('SHAP not available. Install with: pip install shap')

if not shap_succeeded:
    print('Alternative: Using basic feature importance from tree-based models')

    # Use the pipeline's 'classifier' step when there is one, otherwise treat
    # the model itself as the estimator.
    classifier = getattr(model, 'named_steps', {}).get('classifier', model)

    if hasattr(classifier, 'feature_importances_'):
        importances = classifier.feature_importances_

        # Importances are indexed by the features the classifier actually saw,
        # which can differ from the raw X_test columns when preprocessing
        # expands or drops columns. Ask the preprocessing steps for the names
        # first; fall back to the raw column names.
        try:
            feature_names = list(model[:-1].get_feature_names_out())
        except Exception:
            feature_names = list(X_test.columns)

        # Last resort: generic names, so a length mismatch cannot raise below.
        if len(feature_names) != len(importances):
            feature_names = [f'feature_{i}' for i in range(len(importances))]

        # Create feature importance plot
        indices = np.argsort(importances)[::-1][:15]
        plt.figure(figsize=(10, 8))
        plt.bar(range(len(indices)), importances[indices])
        plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation=45)
        plt.title('Feature Importance (Top 15)')
        plt.tight_layout()
        plt.show()

## 6. Model Prediction Examples

In [None]:
# Print ten randomly chosen test rows with the model's prediction, its
# confidence (largest class probability) and the true label.
sample_indices = np.random.choice(len(X_test), 10, replace=False)

print('=== PREDICTION EXAMPLES ===')
print('Sample predictions with confidence scores:\n')

for idx in sample_indices:
    # Keep the row as a one-row DataFrame so the model receives 2-D input
    sample = X_test.iloc[[idx]]
    pred_proba = model.predict_proba(sample)[0]
    pred_class = model.predict(sample)[0]
    pred_label = label_encoder.inverse_transform([pred_class])[0]
    true_label = y_true_labels.iloc[idx]

    age = sample["age"].iloc[0]
    income = sample["income"].iloc[0]
    credit_score = sample["credit_score"].iloc[0]
    debt_to_income = sample["debt_to_income"].iloc[0]
    confidence = pred_proba.max()
    class_probabilities = {name: prob for name, prob in zip(label_encoder.classes_, pred_proba)}
    verdict = "✓" if pred_label == true_label else "✗"

    print(f'Sample {idx}:')
    print(f'  Age: {age}, Income: ${income:,.0f}')
    print(f'  Credit Score: {credit_score:.0f}, Debt-to-Income: {debt_to_income:.1f}%')
    print(f'  Predicted: {pred_label} (confidence: {confidence:.3f})')
    print(f'  True: {true_label}')
    print(f'  Class probabilities: {class_probabilities}')
    print(f'  Correct: {verdict}')
    print()

## 7. API Integration Testing

In [None]:
# Base URL of the prediction API exercised by the integration tests below.
# The tests are skipped gracefully when nothing answers at this address.
API_BASE_URL = 'http://localhost:5001'  # local backend, port 5001

def test_api_health():
    """Check the API health endpoint.

    Sends a GET request to ``{API_BASE_URL}/api/health`` with a 5 second
    timeout and prints the outcome.

    Returns:
        bool: True when the endpoint answers with HTTP 200, False for any
        other status code or when the request itself fails.
    """
    health_url = f'{API_BASE_URL}/api/health'

    try:
        reply = requests.get(health_url, timeout=5)
    except requests.exceptions.RequestException as e:
        print(f'✗ API connection failed: {e}')
        return False

    if reply.status_code != 200:
        print(f'✗ API returned status code: {reply.status_code}')
        return False

    print('✓ API is running and healthy')
    return True

def test_prediction_endpoint(sample_data):
    """Test the prediction endpoint with one sample payload.

    Posts ``sample_data`` as JSON to ``{API_BASE_URL}/api/predict`` with a
    10 second timeout and prints the outcome.

    Args:
        sample_data: JSON-serialisable payload sent as the request body.

    Returns:
        bool: True when the endpoint answers with HTTP 200 and a body that
        parses as JSON; False for a failed request, a non-200 status, or a
        response body that is not valid JSON.
    """
    try:
        response = requests.post(
            f'{API_BASE_URL}/api/predict',
            json=sample_data,
            timeout=10
        )
    except requests.exceptions.RequestException as e:
        print(f'✗ Prediction request failed: {e}')
        return False

    if response.status_code != 200:
        print(f'✗ Prediction failed with status: {response.status_code}')
        return False

    try:
        result = response.json()
    except ValueError as e:
        # A 200 with a non-JSON body (e.g. an HTML error page) raises a
        # ValueError subclass that is not a RequestException on older
        # versions of requests; report it as a failed test instead of crashing.
        print(f'✗ Prediction response was not valid JSON: {e}')
        return False

    print('✓ Prediction endpoint working')
    print(f'  Response: {result}')
    return True

# Run the API checks: health first, then one prediction request if healthy.
# api_healthy is kept as a notebook variable for the readiness checklist.
print('=== API INTEGRATION TESTS ===')
api_healthy = test_api_health()

if not api_healthy:
    print('Skipping prediction tests - API not available')
    print('Note: Make sure your Flask backend is running on port 5001')
else:
    # One representative applicant payload
    sample_request = dict(
        age=35,
        income=75000,
        credit_score=720,
        employment_length=8,
        loan_amount=30000,
        debt_to_income=25.5,
        previous_defaults=0,
        education_level='Bachelor',
    )
    test_prediction_endpoint(sample_request)

## 8. Performance Monitoring Setup

In [None]:
# Create monitoring functions
def log_prediction(input_data, prediction, confidence, timestamp=None):
    """Build a monitoring record for a single prediction.

    Args:
        input_data: The features the prediction was made from (stored as-is).
        prediction: The predicted value or label.
        confidence: Confidence score attached to the prediction.
        timestamp: Timestamp to record; when None, the current local time in
            ISO 8601 format is used.

    Returns:
        dict: Record with the keys ``timestamp``, ``input_data``,
        ``prediction``, ``confidence`` and ``model_version``. The latter is
        read from the ``model_type`` entry of the notebook-level ``metadata``
        dict, defaulting to ``'unknown'``.
    """
    logged_at = datetime.now().isoformat() if timestamp is None else timestamp

    return dict(
        timestamp=logged_at,
        input_data=input_data,
        prediction=prediction,
        confidence=confidence,
        model_version=metadata.get('model_type', 'unknown'),
    )

def calculate_model_drift(reference_data, new_data, threshold=0.05):
    """Detect per-column data drift with a two-sample Kolmogorov-Smirnov test.

    Every numeric column of ``reference_data`` is compared with the column of
    the same name in ``new_data``. Missing values are dropped before testing.

    Args:
        reference_data: DataFrame holding the baseline distribution.
        new_data: DataFrame holding the data to check; must contain every
            numeric column of ``reference_data``.
        threshold: Significance level; drift is flagged when the p-value is
            strictly below it.

    Returns:
        dict: Maps column name to a dict with ``ks_statistic`` (float),
        ``p_value`` (float) and ``drift_detected`` (bool). Values are plain
        Python types so the result can be passed to ``json.dumps``. When
        either side has no non-missing values, the statistic and p-value are
        NaN and ``drift_detected`` is False.

    Raises:
        KeyError: If a numeric reference column is absent from ``new_data``.
    """
    # Imported once per call rather than once per column
    from scipy.stats import ks_2samp

    drift_scores = {}

    for column in reference_data.select_dtypes(include=[np.number]).columns:
        reference_values = reference_data[column].dropna()
        new_values = new_data[column].dropna()

        # ks_2samp rejects empty samples; record "cannot tell" instead
        if reference_values.empty or new_values.empty:
            drift_scores[column] = {
                'ks_statistic': float('nan'),
                'p_value': float('nan'),
                'drift_detected': False
            }
            continue

        statistic, p_value = ks_2samp(reference_values, new_values)

        # Cast NumPy scalars to built-ins: numpy.bool_ is not JSON-serialisable
        drift_scores[column] = {
            'ks_statistic': float(statistic),
            'p_value': float(p_value),
            'drift_detected': bool(p_value < threshold)
        }

    return drift_scores

# Announce the monitoring helpers defined above
print('=== MONITORING SETUP ===')
print('Monitoring functions created:')
for helper_description in (
    '- log_prediction(): For logging predictions',
    '- calculate_model_drift(): For detecting data drift',
):
    print(helper_description)

# Build one example log record and show it
sample_log = log_prediction({'age': 30, 'income': 50000}, 'Medium', 0.85)
print(f'\nSample log entry: {sample_log}')

## 9. Model Quality Report

In [None]:
# Generate comprehensive model quality report
def generate_model_report():
    """Assemble a JSON-serialisable quality report for the evaluated model.

    Reads the notebook-level results computed in earlier cells: ``metadata``,
    ``accuracy``, ``precision``, ``recall``, ``f1``, ``label_encoder``,
    ``class_accuracies`` and ``roc_auc``.

    Returns:
        dict: Report with the sections ``model_info``, ``performance_metrics``,
        ``class_performance`` (per class ``accuracy`` and ``auc_score``; an
        undefined metric is reported as None) and ``recommendations`` (list of
        strings, never empty).
    """
    def _json_number(value):
        # NaN is not valid JSON; emit null for metrics that are undefined
        number = float(value)
        return None if np.isnan(number) else number

    class_names = list(label_encoder.classes_)

    report = {
        'model_info': {
            'model_type': metadata.get('model_type', 'Unknown'),
            # The training date belongs to the model, not to this run, so it is
            # taken from the metadata; the run date is recorded separately.
            'training_date': metadata.get('training_date', 'Unknown'),
            'report_date': datetime.now().strftime('%Y-%m-%d'),
            'feature_count': len(metadata.get('feature_names', [])),
            'target_classes': metadata.get('target_classes', []),
        },
        'performance_metrics': {
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1),
        },
        'class_performance': {},
        'recommendations': []
    }

    # Add per-class performance
    for i, class_name in enumerate(class_names):
        report['class_performance'][class_name] = {
            'accuracy': _json_number(class_accuracies[i]),
            'auc_score': _json_number(roc_auc.get(i, 0.0))
        }

    # Add recommendations based on performance
    if accuracy < 0.8:
        report['recommendations'].append('Consider model retraining or feature engineering')

    # Ignore undefined (NaN) accuracies so they neither hide nor fake a weak class
    accuracies = np.asarray(class_accuracies, dtype=float)
    if not np.all(np.isnan(accuracies)) and np.nanmin(accuracies) < 0.7:
        worst_class = class_names[int(np.nanargmin(accuracies))]
        report['recommendations'].append(f'Improve {worst_class} class prediction')

    # Triggered only when even the best-separated class stays below 0.8 AUC
    auc_values = [value for value in roc_auc.values() if not np.isnan(value)]
    if auc_values and max(auc_values) < 0.8:
        report['recommendations'].append('Consider different algorithms or ensemble methods')

    if not report['recommendations']:
        report['recommendations'].append('Model performance is satisfactory')

    return report

# Generate and display report
model_report = generate_model_report()

print('=== MODEL QUALITY REPORT ===')
print(json.dumps(model_report, indent=2))

# Save the report next to the model artifacts. The directory comes from
# model_path (set in the model-loading cell), so the artifact location is
# defined in a single place.
report_path = f'{model_path}model_quality_report.json'
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(model_report, f, indent=2)

print(f'\nReport saved to: {report_path}')

## 10. Production Readiness Checklist

In [None]:
# Production readiness checklist: category -> {check description -> passed?}.
# Only the 'Model Performance' entries and 'API endpoint accessible' are
# computed from results of earlier cells; every other entry is a hard-coded
# judgement that this notebook does not verify.
# NOTE(review): the hard-coded True values will read as passed even if the
# corresponding artifact or feature is missing — consider real checks.
checklist = {
    'Model Performance': {
        'Accuracy > 80%': accuracy > 0.8,
        'All classes AUC > 70%': min(roc_auc.values()) > 0.7,
        # Spread between the best and worst per-class accuracy
        'Balanced class performance': (max(class_accuracies) - min(class_accuracies)) < 0.3
    },
    'Technical Requirements': {
        'Model serialized': True,  # hard-coded: assumed saved by the training notebook
        'Preprocessing pipeline included': True,  # hard-coded: assumed part of the model
        'Label encoder available': True,  # hard-coded: assumed saved
        'Metadata documented': True,  # hard-coded: assumed present
    },
    'API Integration': {
        # Falls back to False when the API test cell has not been run
        'API endpoint accessible': api_healthy if 'api_healthy' in locals() else False,
        'Prediction format standardized': True,  # hard-coded: JSON format
        'Error handling implemented': False,  # hard-coded: still to be implemented
    },
    'Monitoring & Logging': {
        'Prediction logging ready': True,  # hard-coded: helper functions exist
        'Performance monitoring setup': True,  # hard-coded: helper functions exist
        'Data drift detection ready': True,  # hard-coded: helper function exists
        'Alerting system configured': False,  # hard-coded: still to be implemented
    }
}

print('=== PRODUCTION READINESS CHECKLIST ===')
print()

# Print every check with a pass/fail icon; a category is ready only when all
# of its checks pass, and the notebook is ready only when all categories are.
category_verdicts = []
for category, checks in checklist.items():
    print(f'{category}:')
    for check, status in checks.items():
        status_icon = '✓' if status else '✗'
        print(f'  {status_icon} {check}')

    category_ready = all(checks.values())
    category_verdicts.append(category_ready)
    print(f'  Category status: {"READY" if category_ready else "NEEDS WORK"}')
    print()

overall_ready = all(category_verdicts)
print(f'Overall Production Readiness: {"READY" if overall_ready else "NEEDS WORK"}')

if not overall_ready:
    print('\nRecommended next steps:')
    for next_step in (
        '- Implement proper error handling in API',
        '- Set up alerting system for monitoring',
        '- Create automated model validation pipeline',
        '- Implement model versioning strategy',
    ):
        print(next_step)