In [10]:
# =============================================================================
# NOTEBOOK 5: PERFORMANCE OPTIMIZATION AND DEPLOYMENT
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import precision_recall_curve, confusion_matrix
from imblearn.over_sampling import SMOTE, ADASYN
import warnings
warnings.filterwarnings('ignore')

print("üöÄ NOTEBOOK 5: PERFORMANCE OPTIMIZATION AND DEPLOYMENT")
print("=" * 50)

# Load best model and data
print("üì• Loading best model and data...")
best_model = joblib.load('model_ensemble.pkl')  # Assuming ensemble is best
X_train_scaled = joblib.load('X_train_scaled.pkl')
X_test_scaled = joblib.load('X_test_scaled.pkl')
y_train = joblib.load('y_train.pkl')
y_test = joblib.load('y_test.pkl')

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")

# THRESHOLD OPTIMIZATION FOR MAXIMUM RECALL
print("\nüéØ THRESHOLD OPTIMIZATION FOR MAXIMUM RECALL")

# Get probabilities from best model
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Find optimal threshold for target recall
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Find threshold that gives us at least 75% recall
target_recall = 0.75
optimal_threshold_idx = np.argmax(recall >= target_recall)
optimal_threshold = thresholds[optimal_threshold_idx] if optimal_threshold_idx < len(thresholds) else 0.3

print(f"üîç Threshold Analysis:")
print(f"  ‚Ä¢ Default threshold (0.5): Recall = {recall[np.argmax(recall >= 0.5)]:.3f}")
print(f"  ‚Ä¢ Optimal threshold for {target_recall*100}% recall: {optimal_threshold:.3f}")

# Apply optimized threshold
y_pred_optimized = (y_pred_proba >= optimal_threshold).astype(int)

# Calculate optimized metrics
optimized_cm = confusion_matrix(y_test, y_pred_optimized)
tn_opt, fp_opt, fn_opt, tp_opt = optimized_cm.ravel()

optimized_recall = tp_opt / (tp_opt + fn_opt) if (tp_opt + fn_opt) > 0 else 0
optimized_precision = tp_opt / (tp_opt + fp_opt) if (tp_opt + fp_opt) > 0 else 0
optimized_f1 = 2 * (optimized_precision * optimized_recall) / (optimized_precision + optimized_recall) if (optimized_precision + optimized_recall) > 0 else 0

print(f"\nüìä OPTIMIZED PERFORMANCE:")
print(f"  ‚Ä¢ Threshold: {optimal_threshold:.3f}")
print(f"  ‚Ä¢ Recall:    {optimized_recall:.3f} ({optimized_recall*100:.1f}% of leavers detected)")
print(f"  ‚Ä¢ Precision: {optimized_precision:.3f}")
print(f"  ‚Ä¢ F1-Score:  {optimized_f1:.3f}")
print(f"  ‚Ä¢ True Positives:  {tp_opt}")
print(f"  ‚Ä¢ False Negatives: {fn_opt}")

# BUSINESS IMPACT ANALYSIS
print(f"\nüí∞ BUSINESS IMPACT ANALYSIS")
print("=" * 60)

# Cost assumptions
AVERAGE_REPLACEMENT_COST = 50000
TRAINING_COST_PER_EMPLOYEE = 5000

# Calculate business impact with optimized model
risk_scores_optimized = (y_pred_proba >= optimal_threshold).astype(int)
high_risk_employees = risk_scores_optimized.sum()

# Assuming 70% of identified high-risk employees can be retained with interventions
retention_success_rate = 0.70
potential_retentions = int(high_risk_employees * retention_success_rate)

# Cost savings calculation
cost_savings = potential_retentions * AVERAGE

üöÄ NOTEBOOK 5: PERFORMANCE OPTIMIZATION AND DEPLOYMENT
üì• Loading best model and data...
Training set: (1176, 30)
Test set: (294, 30)

üéØ THRESHOLD OPTIMIZATION FOR MAXIMUM RECALL
üîç Threshold Analysis:
  ‚Ä¢ Default threshold (0.5): Recall = 1.000
  ‚Ä¢ Optimal threshold for 75.0% recall: 0.010

üìä OPTIMIZED PERFORMANCE:
  ‚Ä¢ Threshold: 0.010
  ‚Ä¢ Recall:    1.000 (100.0% of leavers detected)
  ‚Ä¢ Precision: 0.160
  ‚Ä¢ F1-Score:  0.276
  ‚Ä¢ True Positives:  47
  ‚Ä¢ False Negatives: 0

üí∞ BUSINESS IMPACT ANALYSIS


NameError: name 'AVERAGE' is not defined

In [11]:

# =============================================================================
# NOTEBOOK 5: PERFORMANCE OPTIMIZATION AND DEPLOYMENT
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import precision_recall_curve, confusion_matrix
from imblearn.over_sampling import SMOTE, ADASYN
import warnings
warnings.filterwarnings('ignore')

print("üöÄ NOTEBOOK 5: PERFORMANCE OPTIMIZATION AND DEPLOYMENT")
print("=" * 50)

# Load best model and data
print("üì• Loading best model and data...")
best_model = joblib.load('model_ensemble.pkl')  # Assuming ensemble is best
X_train_scaled = joblib.load('X_train_scaled.pkl')
X_test_scaled = joblib.load('X_test_scaled.pkl')
y_train = joblib.load('y_train.pkl')
y_test = joblib.load('y_test.pkl')

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")

# THRESHOLD OPTIMIZATION FOR MAXIMUM RECALL
print("\nüéØ THRESHOLD OPTIMIZATION FOR MAXIMUM RECALL")

# Get probabilities from best model
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Find optimal threshold for target recall
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Find threshold that gives us at least 75% recall
target_recall = 0.75
optimal_threshold_idx = np.argmax(recall >= target_recall)
optimal_threshold = thresholds[optimal_threshold_idx] if optimal_threshold_idx < len(thresholds) else 0.3

print(f"üîç Threshold Analysis:")
print(f"  ‚Ä¢ Default threshold (0.5): Recall = {recall[np.argmax(recall >= 0.5)]:.3f}")
print(f"  ‚Ä¢ Optimal threshold for {target_recall*100}% recall: {optimal_threshold:.3f}")

# Apply optimized threshold
y_pred_optimized = (y_pred_proba >= optimal_threshold).astype(int)

# Calculate optimized metrics
optimized_cm = confusion_matrix(y_test, y_pred_optimized)
tn_opt, fp_opt, fn_opt, tp_opt = optimized_cm.ravel()

optimized_recall = tp_opt / (tp_opt + fn_opt) if (tp_opt + fn_opt) > 0 else 0
optimized_precision = tp_opt / (tp_opt + fp_opt) if (tp_opt + fp_opt) > 0 else 0
optimized_f1 = 2 * (optimized_precision * optimized_recall) / (optimized_precision + optimized_recall) if (optimized_precision + optimized_recall) > 0 else 0

print(f"\nüìä OPTIMIZED PERFORMANCE:")
print(f"  ‚Ä¢ Threshold: {optimal_threshold:.3f}")
print(f"  ‚Ä¢ Recall:    {optimized_recall:.3f} ({optimized_recall*100:.1f}% of leavers detected)")
print(f"  ‚Ä¢ Precision: {optimized_precision:.3f}")
print(f"  ‚Ä¢ F1-Score:  {optimized_f1:.3f}")
print(f"  ‚Ä¢ True Positives:  {tp_opt}")
print(f"  ‚Ä¢ False Negatives: {fn_opt}")

# BUSINESS IMPACT ANALYSIS
print(f"\nüí∞ BUSINESS IMPACT ANALYSIS")
print("=" * 60)

# Cost assumptions
AVERAGE_REPLACEMENT_COST = 50000
TRAINING_COST_PER_EMPLOYEE = 5000

# Calculate business impact with optimized model
risk_scores_optimized = (y_pred_proba >= optimal_threshold).astype(int)
high_risk_employees = risk_scores_optimized.sum()

# Assuming 70% of identified high-risk employees can be retained with interventions
retention_success_rate = 0.70
potential_retentions = int(high_risk_employees * retention_success_rate)

# Cost savings calculation
cost_savings = potential_retentions * AVERAGE_REPLACEMENT_COST

print(f"  ‚Ä¢ Potential Retentions (assuming {retention_success_rate*100}% success rate): {potential_retentions} employees")
print(f"  ‚Ä¢ Estimated Cost Savings: ${cost_savings:,.2f}")

# Additional business impact metrics
# Cost of false positives (interventions on employees who wouldn't leave)
cost_of_false_positives = fp_opt * TRAINING_COST_PER_EMPLOYEE
print(f"  ‚Ä¢ Estimated Cost of False Positives: ${cost_of_false_positives:,.2f}")

# Cost of false negatives (leavers not identified)
# Assuming average loss per leaver is the replacement cost
cost_of_false_negatives = fn_opt * AVERAGE_REPLACEMENT_COST
print(f"  ‚Ä¢ Estimated Cost of False Negatives: ${cost_of_false_negatives:,.2f}")

# Net savings
net_savings = cost_savings - cost_of_false_positives
print(f"  ‚Ä¢ Estimated Net Savings: ${net_savings:,.2f}")


# DEPLOYMENT CONSIDERATIONS (Placeholder - actual deployment would involve more steps)
print(f"\n‚öôÔ∏è DEPLOYMENT CONSIDERATIONS")
print("=" * 60)
print("  ‚Ä¢ The trained model and scaler can be saved for deployment.")
print("  ‚Ä¢ A real-time or batch scoring process would be needed to apply the model to new data.")
print("  ‚Ä¢ Integration with HR systems would be required for automated risk assessment.")
print("  ‚Ä¢ Ongoing monitoring and retraining of the model will be necessary.")

print("\n‚úÖ Performance optimization and business impact analysis completed.")

üöÄ NOTEBOOK 5: PERFORMANCE OPTIMIZATION AND DEPLOYMENT
üì• Loading best model and data...
Training set: (1176, 30)
Test set: (294, 30)

üéØ THRESHOLD OPTIMIZATION FOR MAXIMUM RECALL
üîç Threshold Analysis:
  ‚Ä¢ Default threshold (0.5): Recall = 1.000
  ‚Ä¢ Optimal threshold for 75.0% recall: 0.010

üìä OPTIMIZED PERFORMANCE:
  ‚Ä¢ Threshold: 0.010
  ‚Ä¢ Recall:    1.000 (100.0% of leavers detected)
  ‚Ä¢ Precision: 0.160
  ‚Ä¢ F1-Score:  0.276
  ‚Ä¢ True Positives:  47
  ‚Ä¢ False Negatives: 0

üí∞ BUSINESS IMPACT ANALYSIS
  ‚Ä¢ Potential Retentions (assuming 70.0% success rate): 205 employees
  ‚Ä¢ Estimated Cost Savings: $10,250,000.00
  ‚Ä¢ Estimated Cost of False Positives: $1,235,000.00
  ‚Ä¢ Estimated Cost of False Negatives: $0.00
  ‚Ä¢ Estimated Net Savings: $9,015,000.00

‚öôÔ∏è DEPLOYMENT CONSIDERATIONS
  ‚Ä¢ The trained model and scaler can be saved for deployment.
  ‚Ä¢ A real-time or batch scoring process would be needed to apply the model to new data.
  ‚Ä¢ Integr