## 1. Setup and Data Loading

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# Note: this also hides warnings raised by the libraries used below.
warnings.filterwarnings(action='ignore')

# Plot appearance: style sheet, then color palette, then default figure size
# (same order as before, since each call writes to matplotlib's rcParams).
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries loaded successfully!")

In [None]:
# Load the provider-level dataset written by the feature engineering notebook.
# Only the file read sits inside the try block. A missing file is reported with
# guidance and then re-raised: swallowing it would leave X and y undefined and make
# every later cell fail with an unrelated NameError.
try:
    final_dataset = pd.read_csv('../data/provider_level.csv', index_col='Provider')
except FileNotFoundError:
    print("‚ùå Dataset not found. Please run the data exploration and feature engineering notebook first.")
    print("Expected file: ../data/provider_level.csv")
    raise

print(f"‚úÖ Dataset loaded successfully!")
print(f"Dataset shape: {final_dataset.shape}")

# Features are every column except the two label columns; the numeric label is the target.
X = final_dataset.drop(['PotentialFraud', 'PotentialFraud_numeric'], axis=1)
y = final_dataset['PotentialFraud_numeric']

# Report the class balance (label 1 is printed as fraud, label 0 as non-fraud).
print(f"Features shape: {X.shape}")
print(f"Target distribution:")
print(f"  Non-fraud: {(y == 0).sum()} ({(y == 0).mean():.1%})")
print(f"  Fraud: {(y == 1).sum()} ({(y == 1).mean():.1%})")

## 2. Machine Learning Setup

In [None]:
# Import ML libraries
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_recall_curve,
    roc_curve, auc, precision_score, recall_score, f1_score, 
    accuracy_score, roc_auc_score, average_precision_score
)
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

# XGBoost is optional: bind the module to `xgb` when it imports cleanly,
# otherwise bind None so later cells can check `xgb is not None`.
try:
    import xgboost as xgb
except ImportError:
    xgb = None
    print("‚ö†Ô∏è XGBoost not available, skipping XGBoost models")
else:
    print("‚úÖ XGBoost available")

print("ML libraries loaded successfully!")

In [None]:
# Data preparation for modeling
print("=== Data Preparation for Modeling ===")

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 'balanced' weights per label value, keyed by label for readable printing
_train_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=_train_labels, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(_train_labels, class_weights)}

# Ratio of label-0 rows to label-1 rows in the training split (passed to XGBoost later)
_n_negative = (y_train == 0).sum()
_n_positive = (y_train == 1).sum()
scale_pos_weight = _n_negative / _n_positive

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
print(f"Class weights: {class_weight_dict}")
print(f"Scale pos weight: {scale_pos_weight:.2f}")

# Shuffled, seeded 5-fold stratified splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("‚úÖ Data preparation complete!")

## 3. Model Definition and Training

In [None]:
# Model evaluation function
def evaluate_model(y_true, y_pred, y_pred_proba):
    """Score one set of predictions against the true labels.

    Args:
        y_true: True labels.
        y_pred: Predicted class labels; used for accuracy, precision, recall and F1.
        y_pred_proba: Prediction scores; used for ROC AUC and average precision.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC'
        and 'PR_AUC', in that order.
    """
    scores = {'Accuracy': accuracy_score(y_true, y_pred)}

    # The three label-based metrics share the same call signature.
    binary_metrics = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    for label, metric_fn in binary_metrics:
        scores[label] = metric_fn(y_true, y_pred, average='binary')

    # Ranking metrics work on the scores rather than the hard labels.
    scores['ROC_AUC'] = roc_auc_score(y_true, y_pred_proba)
    scores['PR_AUC'] = average_precision_score(y_true, y_pred_proba)
    return scores

# Define models with class weighting
def _make_pipeline(classifier, scale=False):
    """Build imputer -> (optional scaler) -> classifier with fresh step instances."""
    steps = [('imputer', SimpleImputer(strategy='median'))]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('classifier', classifier))
    return Pipeline(steps)


# Every pipeline median-imputes first; only the logistic regression and SVM
# pipelines also standardize the features.
models = {
    'Logistic_Regression': _make_pipeline(
        LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
        scale=True,
    ),
    'Random_Forest': _make_pipeline(
        RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=100),
    ),
    'Decision_Tree': _make_pipeline(
        DecisionTreeClassifier(class_weight='balanced', random_state=42, max_depth=10),
    ),
    'SVM': _make_pipeline(
        SVC(class_weight='balanced', random_state=42, probability=True),
        scale=True,
    ),
}

# XGBoost is added only when the optional import succeeded; it takes the
# negative/positive ratio instead of a class_weight argument.
if xgb is not None:
    models['XGBoost'] = _make_pipeline(
        xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42, eval_metric='logloss'),
    )

print(f"‚úÖ {len(models)} models defined: {list(models.keys())}")

In [None]:
# Train and evaluate all models
print("=== Model Training and Evaluation ===")

results, trained_models = [], {}

for name in models:
    pipeline = models[name]
    print(f"\nTraining {name}...")

    try:
        # Fit on the training split, then score the held-out test split
        pipeline.fit(X_train, y_train)
        y_pred_test = pipeline.predict(X_test)
        y_pred_proba_test = pipeline.predict_proba(X_test)[:, 1]  # column 1 of predict_proba

        # Metric dict plus the model name as the last key
        metrics = {
            **evaluate_model(y_test, y_pred_test, y_pred_proba_test),
            'Model': name,
        }
        results.append(metrics)

        # Keep the fitted pipeline for later cells
        trained_models[name] = pipeline

        print(f"  F1: {metrics['F1']:.4f}, PR-AUC: {metrics['PR_AUC']:.4f}")

    except Exception as e:
        # Best effort: report the failure and continue with the next model
        print(f"  ‚ùå Error training {name}: {e}")

# One row per successfully evaluated model
results_df = pd.DataFrame(results)
print("\n=== Model Comparison Results ===")
print(results_df.round(4))

## 4. Model Comparison and Visualization

In [None]:
# Model comparison visualization
if len(results_df) > 0:
    # 2x2 dashboard: heatmap, F1 ranking, and two annotated scatter plots
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
    heat_ax, f1_ax = axes[0, 0], axes[0, 1]

    # Panel 1: metrics as rows, models as columns
    metrics_cols = ['Precision', 'Recall', 'F1', 'ROC_AUC', 'PR_AUC']
    heatmap_data = results_df.set_index('Model')[metrics_cols]
    sns.heatmap(
        heatmap_data.T,
        annot=True,
        fmt='.3f',
        cmap='RdYlGn',
        ax=heat_ax,
        cbar_kws={'label': 'Score'},
    )
    heat_ax.set_title('Model Performance Heatmap', fontweight='bold')

    # Panel 2: horizontal bars ordered by ascending F1
    f1_scores = results_df.sort_values(by='F1')
    f1_ax.barh(f1_scores['Model'], f1_scores['F1'], color='skyblue', alpha=0.7)
    f1_ax.set_xlabel('F1 Score')
    f1_ax.set_title('F1 Score Comparison', fontweight='bold')
    f1_ax.grid(True, alpha=0.3)

    # Panels 3 and 4: one point per model, colored by F1 and labelled with the model name
    scatter_panels = [
        (axes[1, 0], 'Precision', 'Recall', 'viridis',
         'Precision', 'Recall', 'Precision vs Recall Trade-off'),
        (axes[1, 1], 'ROC_AUC', 'PR_AUC', 'plasma',
         'ROC AUC', 'PR AUC', 'ROC AUC vs PR AUC'),
    ]
    for panel, x_col, y_col, colormap, x_label, y_label, panel_title in scatter_panels:
        panel.scatter(results_df[x_col], results_df[y_col],
                      c=results_df['F1'], cmap=colormap, s=100, alpha=0.7)
        for model, x_val, y_val in zip(results_df['Model'], results_df[x_col], results_df[y_col]):
            panel.annotate(model, (x_val, y_val),
                           xytext=(5, 5), textcoords='offset points', fontsize=9)
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)
        panel.set_title(panel_title, fontweight='bold')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Best model summary: the row with the highest F1
    best_f1_model = results_df.loc[results_df['F1'].idxmax()]
    print("\n=== BEST MODEL BY F1-SCORE ===")
    print(f"Model: {best_f1_model['Model']}")
    print(f"F1-Score: {best_f1_model['F1']:.4f}")
    print(f"Precision: {best_f1_model['Precision']:.4f}")
    print(f"Recall: {best_f1_model['Recall']:.4f}")
    print(f"PR-AUC: {best_f1_model['PR_AUC']:.4f}")

## 5. Results and Recommendations

In [None]:
# Final recommendations: summarize the highest-F1 model, estimate how many test-set
# fraud cases it caught, print deployment notes, and save the results table to CSV.
if len(results_df) > 0:
    print("="*60)
    print("HEALTHCARE FRAUD DETECTION - FINAL RECOMMENDATIONS")
    print("="*60)
    
    # Row of the results table with the highest F1
    best_model = results_df.loc[results_df['F1'].idxmax()]
    
    print(f"\nüéØ RECOMMENDED MODEL: {best_model['Model']}")
    print(f"\nüìä PERFORMANCE METRICS:")
    print(f"  ‚Ä¢ F1-Score: {best_model['F1']:.4f}")
    print(f"  ‚Ä¢ Precision: {best_model['Precision']:.4f}")
    print(f"  ‚Ä¢ Recall: {best_model['Recall']:.4f}")
    print(f"  ‚Ä¢ PR-AUC: {best_model['PR_AUC']:.4f}")
    
    # Recall is TP / positives, so recall * positives should recover TP. The product
    # is a float that can land just below the integer (e.g. 1/49*49 == 0.9999999999999999),
    # and a bare int() would truncate that down by one. Round before converting.
    total_fraud = int((y_test == 1).sum())
    fraud_detected = int(round(best_model['Recall'] * total_fraud))
    
    print(f"\nüìà BUSINESS IMPACT:")
    print(f"  ‚Ä¢ Fraud cases detected: {fraud_detected} out of {total_fraud}")
    print(f"  ‚Ä¢ Detection rate: {best_model['Recall']:.1%}")
    print(f"  ‚Ä¢ Precision rate: {best_model['Precision']:.1%}")
    
    print(f"\nüèÜ CLASS IMBALANCE STRATEGY: Class Weighting")
    print(f"  ‚Ä¢ Maintains original data distribution")
    print(f"  ‚Ä¢ Avoids synthetic data problems")
    print(f"  ‚Ä¢ Computationally efficient")
    
    print(f"\nüîß DEPLOYMENT RECOMMENDATIONS:")
    print(f"  1. Implement {best_model['Model']} as primary detection system")
    print(f"  2. Use class weighting for imbalance handling")
    print(f"  3. Set threshold based on business cost considerations")
    print(f"  4. Regular model retraining (quarterly recommended)")
    print(f"  5. Monitor feature importance for model transparency")
    print(f"  6. Implement alerts for significant performance drift")
    
    # Save results (one row per evaluated model, without the DataFrame index)
    results_df.to_csv('../data/model_results.csv', index=False)
    print(f"\nüíæ Results saved to: ../data/model_results.csv")
    
    print(f"\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
else:
    print("‚ö†Ô∏è No models were successfully trained. Please check your data and try again.")

## 6. Model Persistence (Optional)

Save the best-performing model for production deployment.

In [None]:
# Save the best model
if len(results_df) == 0:
    print("‚ö†Ô∏è No trained models available for saving.")
else:
    import joblib

    # Name of the highest-F1 model, and the fitted pipeline stored under that name
    best_row = results_df['F1'].idxmax()
    best_model_name = results_df.loc[best_row, 'Model']
    best_pipeline = trained_models[best_model_name]

    # Persist the pipeline; the file name embeds the lower-cased model name
    model_path = f'../data/best_fraud_detection_model_{best_model_name.lower()}.pkl'
    joblib.dump(best_pipeline, model_path)

    print(f"‚úÖ Best model ({best_model_name}) saved to: {model_path}")

    # Persist the feature column names alongside the model
    feature_names = X.columns.tolist()
    joblib.dump(feature_names, '../data/feature_names.pkl')

    print(f"‚úÖ Feature names saved to: ../data/feature_names.pkl")

    # Usage hint for loading the saved artifacts
    print(f"\nüìã To use this model for prediction:")
    print(f"   model = joblib.load('{model_path}')")
    print(f"   features = joblib.load('../data/feature_names.pkl')")
    print(f"   predictions = model.predict(new_data)")