# MLimputer - Detailed Analysis

This notebook provides an in-depth analysis of a single imputation strategy:
1. Column-wise missing data analysis
2. Cross-validation with detailed fold results
3. Test set predictions and metrics
4. Feature importance analysis

## Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

from mlimputer import MLimputer
from mlimputer.evaluation.cross_validation import CrossValidator, CrossValidationConfig
from mlimputer.schemas.parameters import imputer_parameters
from mlimputer.data.data_generator import ImputationDatasetGenerator

import warnings
# Silences every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Title banner: a 60-character rule above and below the heading.
rule = "=" * 60
print(rule, "MLIMPUTER - DETAILED ANALYSIS", rule, sep="\n")

## Generate Dataset

Binary classification dataset with a higher missing rate (20%).

In [None]:
# Seeded generator: 1500 samples with a 20% missing rate requested.
generator = ImputationDatasetGenerator(random_state=42)
X, y = generator.quick_binary(n_samples=1500, missing_rate=0.20)

# Count the missing cells once and reuse the figure for both the total and the share.
missing_total = X.isnull().sum().sum()

print(f"Dataset: {X.shape}")
print(f"Missing: {missing_total} values ({missing_total/X.size:.1%})")
print("\nTarget distribution:")
print(y.value_counts())

## Train/Test Split

In [None]:
# Features and target side by side so both travel together through the split.
data = pd.concat([X, y], axis=1)

# Positional 80/20 holdout: the first 80% of rows train, the remainder tests.
# Rows are not shuffled here; each part gets a fresh 0..n-1 index.
split_at = int(0.8 * len(data))
train, test = (
    part.reset_index(drop=True)
    for part in (data.iloc[:split_at], data.iloc[split_at:])
)

print(f"Training: {train.shape}")
print(f"Test: {test.shape}")

## Configure Random Forest Strategy

Custom configuration with more estimators and greater tree depth

In [None]:
# Start from the library's parameter set and override the Random Forest entries.
params = imputer_parameters()

rf_overrides = {
    "n_estimators": 100,
    "max_depth": 15,
    "min_samples_split": 5,
}
for option, setting in rf_overrides.items():
    params["RandomForest"][option] = setting

# Echo the full Random Forest section, including entries that were not overridden.
print("Random Forest Configuration:")
for key, value in params["RandomForest"].items():
    print(f"  {key}: {value}")

## Fit and Transform

In [None]:
imputer = MLimputer(
    imput_model="RandomForest",
    imputer_configs=params
)

# Feature-only views: the target column is never passed to the imputer.
train_features_raw = train.drop(columns=['target'])
test_features_raw = test.drop(columns=['target'])

# Fit on the training features only; the test features are transformed with
# the imputer fitted on train, never used for fitting.
imputer.fit(X=train_features_raw)

# Transform both sets
X_train_imputed = imputer.transform(X=train_features_raw)
X_test_imputed = imputer.transform(X=test_features_raw)

# Report the measured number of remaining gaps rather than assuming the
# imputation removed all of them, so an incomplete imputation is visible here.
train_missing_before = train_features_raw.isnull().sum().sum()
test_missing_before = test_features_raw.isnull().sum().sum()
train_missing_after = X_train_imputed.isnull().sum().sum()
test_missing_after = X_test_imputed.isnull().sum().sum()

print(f"\n✓ Training imputed: {train_missing_before} → {train_missing_after} missing")
print(f"✓ Test imputed: {test_missing_before} → {test_missing_after} missing")

## Column-wise Missing Data Analysis

Identify which columns had the most missing values

In [None]:
# Missing-value count per feature column of the (pre-imputation) training split,
# keeping only columns that actually have gaps, largest first.
missing_per_column = train.drop(columns=['target']).isnull().sum()
train_missing = missing_per_column[missing_per_column > 0].sort_values(ascending=False)

n_train_rows = len(train)

print("Top 10 columns with missing values:")
for col, count in train_missing.iloc[:10].items():
    # Share of training rows where this column is missing.
    pct = (count / n_train_rows) * 100
    print(f"  {col}: {count} ({pct:.1f}%)")

## Cross-Validation Analysis

5-fold CV with detailed fold-by-fold results

In [None]:
# The validator is given a single frame plus the name of the target column,
# so the target is attached to the imputed training features in place.
# Cells further down drop 'target' again before fitting or reading column names.
X_train_imputed['target'] = train['target'].values

# Classifier evaluated on the imputed data.
model = RandomForestClassifier(n_estimators=50, random_state=42)

# 5 shuffled folds with a fixed seed.
cv_settings = dict(n_splits=5, shuffle=True, random_state=42, verbose=1)
cv_config = CrossValidationConfig(**cv_settings)
validator = CrossValidator(config=cv_config)

cv_results = validator.validate(
    X=X_train_imputed,
    target='target',
    models=[model],
    problem_type='binary_classification',
)

## Fold-by-Fold Results

In [None]:
leaderboard = validator.get_leaderboard()

print("\nFold-by-Fold Performance:")
# Keep every row except the 'Aggregate' one, then show the per-fold metrics.
fold_metric_columns = ['Fold', 'f1', 'accuracy', 'precision', 'recall']
is_fold_row = leaderboard['Fold'] != 'Aggregate'
fold_results = leaderboard[is_fold_row]
fold_results[fold_metric_columns]

## Aggregate Results

In [None]:
print("\nAggregate Cross-Validation Results:")
# Only the row(s) labelled 'Aggregate' in the leaderboard, restricted to the mean metrics.
aggregate_columns = ['F1 Mean', 'ACCURACY Mean', 'PRECISION Mean', 'RECALL Mean']
agg_results = leaderboard.loc[leaderboard['Fold'] == 'Aggregate']
agg_results[aggregate_columns]

## Test Set Prediction

Train final model and evaluate on holdout test set

In [None]:
# Same classifier settings as the cross-validated model, refit on the full training split.
final_model = RandomForestClassifier(n_estimators=50, random_state=42)

# X_train_imputed carries the 'target' column at this point; drop it so the
# model is trained on features only.
X_train_final = X_train_imputed.drop(columns=['target'])
final_model.fit(X_train_final, train['target'])

# Score the holdout split, which was imputed with the train-fitted imputer.
y_pred = final_model.predict(X_test_imputed)
y_true = test['target']

print("\nClassification Report:")
print(classification_report(y_true, y_pred))

## Confusion Matrix

In [None]:
# Pin the label order to the classes the final model was trained on. Without
# an explicit `labels`, the matrix only covers labels that occur in
# y_true/y_pred, so a class missing from the test slice would shrink the matrix
# and no longer fit a fixed set of 2x2 row/column names.
class_labels = list(final_model.classes_)
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

# Row/column names are derived from the same labels, so they always match the
# matrix shape and the actual class values.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

print("\nConfusion Matrix:")
cm_df

## Feature Importance Analysis

Identify which features are most important for prediction

In [None]:
# Importance scores from the fitted final model, paired with the training
# feature names (every column of X_train_imputed except 'target').
feature_names = X_train_imputed.columns.drop('target')
importances = final_model.feature_importances_

# Two-column table (feature, importance), highest importance first.
feature_importance_df = (
    pd.Series(importances, index=feature_names, name='importance')
    .rename_axis('feature')
    .reset_index()
    .sort_values('importance', ascending=False)
)

print("\nTop 10 Most Important Features:")
feature_importance_df.head(10)

## Imputation Summary

In [None]:
summary = imputer.get_summary()

# (label shown to the reader, key looked up in the summary), in print order.
summary_fields = (
    ("Strategy", "model"),
    ("Columns imputed", "n_columns_imputed"),
    ("Fit timestamp", "fit_timestamp"),
    ("Status", "status"),
)

print("\nImputation Summary:")
for label, key in summary_fields:
    print(f"  {label}: {summary[key]}")
print("\n✓ Detailed analysis completed!")