# 🤖 Phase 4: Model Development

This notebook demonstrates comprehensive model development techniques for MLOps, covering model selection, training, validation, and evaluation.

## Table of Contents
1. [Model Selection](#1-model-selection)
2. [Model Training](#2-model-training)
3. [Model Validation](#3-model-validation)
4. [Model Evaluation](#4-model-evaluation)

---

## Prerequisites
Make sure you have the required libraries installed:
```bash
pip install pandas numpy scikit-learn matplotlib seaborn plotly xgboost lightgbm pyarrow joblib
```


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import warnings
import joblib
from datetime import datetime

warnings.filterwarnings('ignore')
np.random.seed(42)

print("✅ Libraries imported successfully!")

# Load data (create a synthetic sample if no processed dataset is usable).
# The versioned file name is a wildcard pattern; pd.read_parquet() is given a
# concrete path, so the pattern is resolved with pathlib first.
from pathlib import Path

dataset_candidates = list(Path('data/processed').glob('dataset_v*.parquet'))
df = None
if dataset_candidates:
    # Choose the most recently modified match. Lexical ordering is avoided
    # because it would rank "v10" before "v2".
    latest_dataset = max(dataset_candidates, key=lambda p: p.stat().st_mtime)
    try:
        df = pd.read_parquet(latest_dataset)
        print("✅ Loaded processed data")
    except Exception as exc:
        # Best-effort load: report why it failed, then fall back to the
        # synthetic sample below instead of aborting the notebook.
        print(f"⚠️  Could not read {latest_dataset}: {exc}")

if df is None:
    print("⚠️  Creating sample data...")
    # Re-seed so the synthetic sample is reproducible on its own
    np.random.seed(42)
    n_samples = 1000
    
    # NOTE: the order of the entries fixes the order of the random draws;
    # reordering them changes the generated values.
    data = {
        'customer_id': range(1, n_samples + 1),
        'age': np.random.normal(35, 12, n_samples).astype(int),
        'income': np.random.lognormal(10, 0.5, n_samples),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
        'employment_status': np.random.choice(['Employed', 'Unemployed', 'Self-employed', 'Retired'], n_samples, p=[0.6, 0.1, 0.2, 0.1]),
        'credit_score': np.random.normal(650, 100, n_samples).astype(int),
        'loan_amount': np.random.exponential(50000, n_samples),
        'default_risk': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'application_date': pd.date_range('2020-01-01', periods=n_samples, freq='D'),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_samples, p=[0.2, 0.15, 0.15, 0.15, 0.35]),
        'marital_status': np.random.choice(['Single', 'Married', 'Divorced', 'Widowed'], n_samples, p=[0.4, 0.4, 0.15, 0.05]),
        'dependents': np.random.poisson(1.5, n_samples),
        'previous_loans': np.random.poisson(2, n_samples),
        'late_payments': np.random.poisson(0.5, n_samples),
        'debt_to_income_ratio': np.random.beta(2, 5, n_samples),
        'credit_utilization': np.random.beta(3, 2, n_samples),
        'home_ownership': np.random.choice(['Rent', 'Own', 'Mortgage'], n_samples, p=[0.4, 0.2, 0.4]),
        'purpose': np.random.choice(['Debt Consolidation', 'Home Improvement', 'Business', 'Education'], n_samples, p=[0.4, 0.2, 0.2, 0.2])
    }
    
    df = pd.DataFrame(data)
    print("✅ Sample dataset created")

print(f"📊 Dataset shape: {df.shape}")


## 1. Model Selection

**Purpose**: Identify and compare different machine learning algorithms.


In [None]:
# 1.1 Prepare Data for Modeling
print("🤖 Step 11: Identify Candidate Models")
print("=" * 50)

# Split the frame into the label and the predictors. The identifier, the
# label and the date column are kept out of the feature matrix; columns that
# are absent are tolerated.
non_feature_columns = ['customer_id', 'default_risk', 'application_date']
y = df['default_risk']
X = df.drop(columns=non_feature_columns, errors='ignore')

# One-hot encode categorical columns, dropping the first level of each
X_encoded = pd.get_dummies(X, drop_first=True)
print(f"Features shape: {X_encoded.shape}")
print(f"Target distribution: {y.value_counts()}")

# Stratified 80/20 hold-out split with a fixed seed
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# 1.2 Define Candidate Models
print("\n🎯 Candidate Models")
print("-" * 30)

# Registry of display name -> unfitted estimator, all seeded with 42.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
# probability=True so the SVM exposes predict_proba like the other models
models['SVM'] = SVC(probability=True, random_state=42)
models['XGBoost'] = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
models['LightGBM'] = lgb.LGBMClassifier(verbose=-1, random_state=42)

print("Models to evaluate:")
print("\n".join(f"  - {name}" for name in models))

# 1.3 Initial Model Evaluation
print("\n📊 Initial Model Performance")
print("-" * 30)

# Display name -> dict of cross-validation and hold-out metrics
model_scores = {}

for name, model in models.items():
    print(f"\nEvaluating {name}...")
    
    # 5-fold cross-validated accuracy on the training split
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    
    # Fit on the whole training split, then score the held-out test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # ROC-AUC ranks examples, so it needs continuous scores; hard 0/1 labels
    # would reduce the curve to a single operating point.
    y_score = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics. zero_division=0 makes the "no positive predictions"
    # case an explicit 0.0 rather than relying on a suppressed warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_score)
    
    model_scores[name] = {
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }
    
    print(f"  CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    print(f"  Test Accuracy: {accuracy:.3f}")
    print(f"  Test F1-Score: {f1:.3f}")
    print(f"  Test ROC-AUC: {roc_auc:.3f}")

# 1.4 Model Comparison
print("\n📈 Model Comparison Summary")
print("-" * 30)

# One row per model, best ROC-AUC first
results_df = pd.DataFrame(model_scores).T
results_df = results_df.sort_values('roc_auc', ascending=False)

print(results_df[['cv_mean', 'accuracy', 'f1', 'roc_auc']].round(3))

# Visualize model performance as grouped bars: one group per metric,
# one bar per model within each group.
plt.figure(figsize=(12, 8))
metrics = ['cv_mean', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
x = np.arange(len(metrics))
width = 0.12

for i, (model_name, scores) in enumerate(model_scores.items()):
    plt.bar(x + i * width, [scores[metric] for metric in metrics], 
            width, label=model_name, alpha=0.8)

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
# Centre each tick under its group for however many models were evaluated
# (this equals the former fixed offset of width * 2.5 when there are six).
group_centre_offset = width * (len(model_scores) - 1) / 2
plt.xticks(x + group_centre_offset, metrics)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

print("✅ Model selection completed!")


## 2. Model Training

**Purpose**: Train models with optimized hyperparameters.


In [None]:
# 2.1 Hyperparameter Tuning
print("🎯 Step 12: Write Training Code")
print("=" * 50)

# Models carried forward to hyperparameter tuning (a fixed list of names
# that must exist in `models`)
best_models = ['Random Forest', 'XGBoost', 'LightGBM']

# Candidate values for each hyperparameter, keyed by model name
param_grids = {}
param_grids['Random Forest'] = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
param_grids['XGBoost'] = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
)
param_grids['LightGBM'] = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[31, 50, 100],
)

print("Hyperparameter tuning for best models...")

# 2.2 Randomized Search for Best Models
# tuned_models: model name -> best estimator found by the search
# tuning_results: model name -> winning parameters and their CV score
tuned_models = {}
tuning_results = {}

for model_name in best_models:
    print(f"\n🔧 Tuning {model_name}...")
    
    # Sample 20 parameter combinations (reduced for demo) and score each
    # with 3-fold cross-validated ROC-AUC, using all available cores.
    search = RandomizedSearchCV(
        estimator=models[model_name],
        param_distributions=param_grids[model_name],
        n_iter=20,
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)
    
    winning_params = search.best_params_
    winning_score = search.best_score_
    
    tuned_models[model_name] = search.best_estimator_
    tuning_results[model_name] = {
        'best_params': winning_params,
        'best_score': winning_score
    }
    
    print(f"  Best parameters: {winning_params}")
    print(f"  Best CV score: {winning_score:.3f}")

# 2.3 Train Final Models
print("\n🚀 Step 13: Train Models")
print("=" * 50)

# Model name -> fitted estimator plus its train/test metrics and predictions
final_models = {}

for model_name, model in tuned_models.items():
    print(f"Training final {model_name}...")
    
    # Each tuned model is the search's best_estimator_, which
    # RandomizedSearchCV (refit=True, its default) has already refit on the
    # full training set with the winning parameters. Fitting it again here
    # would only repeat that work, so the estimator is used as-is.
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    # Positive-class probability, used for ROC-AUC and the ROC curve
    y_pred_proba_test = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics. zero_division=0 makes the "no positive predictions"
    # case an explicit 0.0 rather than relying on a suppressed warning.
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test, zero_division=0)
    test_recall = recall_score(y_test, y_pred_test, zero_division=0)
    test_f1 = f1_score(y_test, y_pred_test, zero_division=0)
    test_roc_auc = roc_auc_score(y_test, y_pred_proba_test)
    
    final_models[model_name] = {
        'model': model,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1,
        'test_roc_auc': test_roc_auc,
        'predictions': y_pred_test,
        'probabilities': y_pred_proba_test
    }
    
    print(f"  Train Accuracy: {train_accuracy:.3f}")
    print(f"  Test Accuracy: {test_accuracy:.3f}")
    print(f"  Test F1-Score: {test_f1:.3f}")
    print(f"  Test ROC-AUC: {test_roc_auc:.3f}")

print("✅ Model training completed!")


## 3. Model Validation

**Purpose**: Validate model performance using various techniques.


In [None]:
# 3.1 Cross-Validation Analysis
print("🔍 Step 14: Validate & Evaluate Models")
print("=" * 50)

# Model name -> mean, standard deviation and per-fold ROC-AUC
cv_results = {}

for model_name, entry in final_models.items():
    print(f"\nCross-validating {model_name}...")
    
    # 5-fold cross-validated ROC-AUC on the training split
    fold_auc = cross_val_score(entry['model'], X_train, y_train, scoring='roc_auc', cv=5)
    mean_auc = fold_auc.mean()
    std_auc = fold_auc.std()
    
    cv_results[model_name] = {
        'cv_mean': mean_auc,
        'cv_std': std_auc,
        'cv_scores': fold_auc
    }
    
    print(f"  CV ROC-AUC: {mean_auc:.3f} (+/- {std_auc * 2:.3f})")

# 3.2 Learning Curves
print("\n📈 Learning Curves")
print("-" * 30)

from sklearn.model_selection import learning_curve

def plot_learning_curve(model, X, y, title):
    """Plot training and validation ROC-AUC against training-set size.

    Scores come from ``learning_curve`` with 3-fold cross-validation at ten
    evenly spaced training-set fractions from 10% to 100%. Each curve is the
    mean across folds, with a shaded band of one standard deviation.

    Args:
        model: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target values.
        title: Text appended to the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    sizes, fit_scores, holdout_scores = learning_curve(
        model, X, y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
    )

    plt.figure(figsize=(10, 6))
    # Draw the training curve first, then the validation curve
    for fold_scores, colour, label in (
        (fit_scores, 'blue', 'Training Score'),
        (holdout_scores, 'red', 'Validation Score'),
    ):
        centre = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=label)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    plt.title(f'Learning Curve - {title}')
    plt.xlabel('Training Set Size')
    plt.ylabel('ROC-AUC Score')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Plot learning curves for best models (skipping any that were not trained)
for model_name in ('Random Forest', 'XGBoost'):
    if model_name not in final_models:
        continue
    plot_learning_curve(final_models[model_name]['model'], X_train, y_train, model_name)

# 3.3 Validation Summary
print("\n📊 Validation Summary")
print("-" * 30)

# One row per cross-validated model, ordered by hold-out ROC-AUC
validated_names = list(cv_results)
summary_columns = {
    'Model': validated_names,
    'CV_Mean': [cv_results[name]['cv_mean'] for name in validated_names],
    'CV_Std': [cv_results[name]['cv_std'] for name in validated_names],
    'Test_ROC_AUC': [final_models[name]['test_roc_auc'] for name in validated_names]
}
validation_summary = pd.DataFrame(summary_columns).sort_values('Test_ROC_AUC', ascending=False)

print(validation_summary.round(3))

print("✅ Model validation completed!")


## 4. Model Evaluation

**Purpose**: Comprehensive evaluation with detailed metrics and visualizations.


In [None]:
# 4.1 Detailed Performance Metrics
print("📊 Detailed Model Evaluation")
print("=" * 50)

# The winner is the model with the highest hold-out ROC-AUC
best_model_name = max(final_models, key=lambda name: final_models[name]['test_roc_auc'])
best_model_info = final_models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Test ROC-AUC: {best_model_info['test_roc_auc']:.3f}")

# Test-set labels and positive-class probabilities of the winner
y_pred_best = best_model_info['predictions']
y_proba_best = best_model_info['probabilities']

print(f"\nDetailed Metrics for {best_model_name}:")
metric_labels = (
    ('Accuracy', 'test_accuracy'),
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1-Score', 'test_f1'),
    ('ROC-AUC', 'test_roc_auc'),
)
for label, key in metric_labels:
    print(f"{label}: {best_model_info[key]:.3f}")

# 4.2 Confusion Matrix
print("\n🔍 Confusion Matrix")
print("-" * 30)

# Rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred_best)
print("Confusion Matrix:")
print(cm)

# Visualize confusion matrix as an annotated heatmap
class_labels = ['No Default', 'Default']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# 4.3 ROC Curve
print("\n📈 ROC Curve")
print("-" * 30)

from sklearn.metrics import roc_curve, auc

# Trace the curve from the winner's positive-class probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_proba_best)
roc_auc = auc(fpr, tpr)
curve_label = f'ROC curve (AUC = {roc_auc:.3f})'

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title(f'ROC Curve - {best_model_name}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

# 4.4 Feature Importance (for tree-based models)
if best_model_name in ('Random Forest', 'XGBoost', 'LightGBM'):
    print(f"\n🎯 Feature Importance - {best_model_name}")
    print("-" * 30)
    
    fitted_best = best_model_info['model']
    if hasattr(fitted_best, 'feature_importances_'):
        # Pair each encoded column with its importance, largest first
        importance_table = {
            'feature': X_encoded.columns,
            'importance': fitted_best.feature_importances_
        }
        feature_importance = pd.DataFrame(importance_table).sort_values('importance', ascending=False)
        
        print("Top 10 Most Important Features:")
        print(feature_importance.head(10))
        
        # Horizontal bar chart of the 15 highest-ranked features
        top_features = feature_importance.head(15)
        bar_positions = range(len(top_features))
        plt.figure(figsize=(10, 8))
        plt.barh(bar_positions, top_features['importance'])
        plt.yticks(bar_positions, top_features['feature'])
        plt.xlabel('Feature Importance')
        plt.title(f'Feature Importance - {best_model_name}')
        # Flip the axis so the most important feature is at the top
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

# 4.5 Model Comparison Visualization
print("\n📊 Model Comparison")
print("-" * 30)

# Tabulate the hold-out metrics of every tuned model, best ROC-AUC first
models_comparison = pd.DataFrame({
    'Model': list(final_models),
    'Accuracy': [info['test_accuracy'] for info in final_models.values()],
    'Precision': [info['test_precision'] for info in final_models.values()],
    'Recall': [info['test_recall'] for info in final_models.values()],
    'F1-Score': [info['test_f1'] for info in final_models.values()],
    'ROC-AUC': [info['test_roc_auc'] for info in final_models.values()]
}).sort_values('ROC-AUC', ascending=False)

print(models_comparison.round(3))

# Save best model
import os  # local import: only needed to create the output directory

best_model = best_model_info['model']
# Build the path once so the saved file and the printed message cannot drift
model_path = f'models/best_model_{best_model_name.lower().replace(" ", "_")}.joblib'
# joblib.dump does not create missing directories, so make sure it exists
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)
print(f"\n✅ Best model saved as: {model_path}")

print("✅ Model evaluation completed!")
