# MLimputer - Performance Evaluation

This notebook demonstrates how to:
1. Compare multiple imputation strategies
2. Use cross-validation for robust evaluation
3. Identify the best-performing strategy
4. Evaluate on a holdout test set
5. Save the best imputer and summarize the evaluation

## Setup and Imports

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from mlimputer import MLimputer
from mlimputer.evaluation.evaluator import Evaluator
from mlimputer.schemas.parameters import imputer_parameters, update_model_config
from mlimputer.data.data_generator import ImputationDatasetGenerator
from mlimputer.utils.serialization import ModelSerializer

import warnings
# Keep the notebook output readable by silencing library warnings.
# NOTE(review): this filter matches every warning category, deprecations included.
warnings.filterwarnings("ignore")

# Title banner: a 60-character rule above and below the heading.
print("=" * 60, "MLIMPUTER - PERFORMANCE EVALUATION", "=" * 60, sep="\n")

## Generate Binary Classification Dataset

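Create a synthetic binary classification dataset with 2,000 samples and a 15% missing-value rate. The generator is seeded (`random_state=42`) so the run is repeatable.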

In [None]:
TASK = "binary_classification"

# Fixed random_state, presumably so the generated data is repeatable across runs.
generator = ImputationDatasetGenerator(random_state=42)
X, y = generator.quick_binary(n_samples=2000, missing_rate=0.15)

# One-shot sanity summary: task name, feature matrix shape, total count of
# missing cells, and the class balance of the target.
print(
    f"Task: {TASK}",
    f"Dataset: {X.shape}",
    f"Missing: {X.isna().sum().sum()} values",
    f"Target distribution:\n{y.value_counts()}",
    sep="\n",
)

## Define Predictive Models

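We'll evaluate the imputation strategies using several tree-based classifiers as downstream models. `primary_metric` selects the metric used later to rank the results.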

In [None]:
# Metric whose "<name> Mean" column is used later to rank the results.
primary_metric = "F1"

# Downstream classifiers; each one is seeded with random_state=42.
predictive_models = [
    RandomForestClassifier(n_estimators=50, random_state=42),
    ExtraTreesClassifier(n_estimators=50, random_state=42),
    GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=42),
    DecisionTreeClassifier(max_depth=10, random_state=42),
]

print(f"Predictive models: {[type(model).__name__ for model in predictive_models]}")
print(f"Primary metric: {primary_metric}")

## Train/Test Split

In [None]:
# Join features and target column-wise so both are split together.
data = pd.concat([X, y], axis="columns")

# Positional 80/20 split: the leading 80% of rows train, the remainder is held out.
# NOTE(review): rows are not shuffled here — assumes the generator already
# returns them in random order; confirm before relying on the holdout scores.
train_size = int(0.8 * len(data))
train, test = (
    part.reset_index(drop=True)
    for part in (data.iloc[:train_size], data.iloc[train_size:])
)

print(f"Training set: {train.shape}", f"Test set: {test.shape}", sep="\n")

## Configure Imputation Strategies

We'll compare 4 different strategies:
- Random Forest
- Extra Trees  
- Gradient Boosting
- KNN

In [None]:
# Keys of the imputation strategies compared in this notebook.
strategies = ["RandomForest", "ExtraTrees", "GBR", "KNN"]

# Start from the library-provided parameter set (presumably the defaults —
# TODO confirm), then override selected values below.
params = imputer_parameters()

# RandomForest: the whole entry is replaced by the helper's result.
params["RandomForest"] = update_model_config(
    "RandomForest",
    dict(n_estimators=50, max_depth=10),
)

# Remaining strategies: a single hyperparameter is changed in place.
params["ExtraTrees"]["n_estimators"] = 50
params["GBR"]["learning_rate"] = 0.05
params["KNN"]["n_neighbors"] = 7

print(f"Strategies to evaluate: {strategies}")

## Run Cross-Validation Evaluation

Use 3-fold cross-validation on the training split to compare strategies. The test split is not touched until the final holdout evaluation.

In [None]:
# The evaluator only receives the training split; the holdout rows are kept
# aside for the test-set evaluation further down.
evaluator = Evaluator(
    train=train,
    target="target",
    problem_type=TASK,
    imputation_models=strategies,
    hparameters=params,
    n_splits=3,
)

# Cross-validated results for the configured strategies, using the
# downstream classifiers defined earlier.
cv_results = evaluator.evaluate_imputation_models(models=predictive_models)

## Identify Best Strategy

In [None]:
# Ask the evaluator which imputation strategy it ranks best.
best_imputer = evaluator.get_best_imputer()

# Framed announcement: blank line, rule, result, rule.
print(
    "\n" + "=" * 60,
    f"✓ Best imputation strategy: {best_imputer}",
    "=" * 60,
    sep="\n",
)

## View Top Results

Show the top 5 model–imputer combinations, ranked by the mean of the primary metric in the aggregated cross-validation rows.

In [None]:
# Column holding the mean of the primary metric.
metric_col = f"{primary_metric} Mean"

# Keep only the rows whose "Fold" value is "Aggregate", then take the five
# highest values of the metric column.
aggregate = cv_results.loc[cv_results["Fold"] == "Aggregate"]
top_results = aggregate.nlargest(n=5, columns=metric_col)

print(f"\nTop 5 combinations by {primary_metric}:")
top_results.loc[:, ["Model", "Imputer Model", metric_col]]

## Evaluate on Test Set

Test the best strategy on holdout data

In [None]:
# Score the selected imputation strategy on the held-out rows with the same
# downstream classifiers used during cross-validation.
test_results = evaluator.evaluate_test_set(
    models=predictive_models,
    imput_model=best_imputer,
    test=test,
)

print("\nTest Set Performance:")
test_results

## Save Best Model

Refit the best imputation strategy on the training features (target column removed) and save it, together with its configuration, for production use.

In [None]:
# Build an imputer for the winning strategy and fit it on the training
# features only (the target column is dropped first).
best_imputer_model = MLimputer(imputer_configs=params, imput_model=best_imputer)
best_imputer_model.fit(X=train.drop(columns="target"))

# Metadata passed to the serializer along with the fitted imputer.
best_config = dict(
    strategy=best_imputer,
    parameters=params.get(best_imputer, {}),  # empty dict if the key is absent
    task=TASK,
    primary_metric=primary_metric,
)

# Written to the current working directory.
ModelSerializer.save(
    obj=best_imputer_model,
    filepath="best_imputer.joblib",
    format="joblib",
    metadata=best_config,
)

print("✓ Model saved: best_imputer.joblib")

## Evaluation Summary

In [None]:
# Summary report from the evaluator; only its 'dataset_shape' entry is read here.
summary = evaluator.get_summary_report()

# Section header framed by 60-character rules.
print("\n" + "=" * 60, "EVALUATION SUMMARY", "=" * 60, sep="\n")

# Recap of the run, one item per line.
# NOTE(review): the second element of 'dataset_shape' is reported as the
# feature count — confirm whether it includes the target column.
print(
    f"\nDataset: {summary['dataset_shape'][0]} samples, {summary['dataset_shape'][1]} features",
    f"Task: {TASK}",
    f"Best imputer: {best_imputer}",
    f"Primary metric: {primary_metric}",
    f"Strategies tested: {len(strategies)}",
    f"Models evaluated: {len(predictive_models)}",
    "\n✓ Evaluation completed!",
    sep="\n",
)