# Benchmark de Métodos de Imputação

Este notebook compara o desempenho do ISCA-k com outros métodos de imputação:
- **ISCA-k**: Método proposto
- **KNN Imputer**: sklearn
- **MICE (IterativeImputer)**: sklearn
- **MissForest**: pacote `missforest` (ou `missingpy`, como alternativa)

## Métricas
- **Numéricas**: R², Pearson, NRMSE
- **Categóricas**: Accuracy
- **Tempo**: segundos

## Padrões de Missingness
- **MCAR**: Missing Completely At Random
- **MAR**: Missing At Random
- **MNAR**: Missing Not At Random

In [None]:
import numpy as np
import pandas as pd
import time
import warnings
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, accuracy_score
from sklearn.datasets import load_iris, load_diabetes
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
import sys
sys.path.insert(0, '.')

from imputers.iscak_imputer import ISCAkCore

# Silence every warning so the benchmark progress output stays readable.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG once for the whole notebook.
np.random.seed(42)

## Funções Utilitárias

In [None]:
def introduce_mcar(data, missing_rate=0.2, random_state=42):
    """
    Introduce MCAR (Missing Completely At Random) values into a copy of ``data``.

    Every cell is masked independently with probability ``missing_rate``.
    Afterwards one randomly chosen cell is restored in every fully masked
    row and in every fully masked column, so no row or column ends up
    100% empty.

    Args:
        data: Complete DataFrame; it is not modified.
        missing_rate: Per-cell probability of becoming missing.
        random_state: Seed passed to ``np.random.seed``. This reseeds
            NumPy's global RNG as a side effect.

    Returns:
        A new DataFrame with the same shape, index and columns as ``data``
        in which the selected cells are NaN.
    """
    np.random.seed(random_state)
    n_rows, n_cols = data.shape

    # Nothing can be masked when an axis is empty, and randint(0) in the
    # "keep one value" steps below would raise.
    if n_rows == 0 or n_cols == 0:
        return data.copy()

    mask = np.random.random((n_rows, n_cols)) < missing_rate

    # Keep at least one observed value in every row ...
    for i in np.flatnonzero(mask.all(axis=1)):
        mask[i, np.random.randint(n_cols)] = False

    # ... and in every column (unmasking never re-empties a row).
    for j in np.flatnonzero(mask.all(axis=0)):
        mask[np.random.randint(n_rows), j] = False

    # Blank all selected cells in one vectorised step.
    return data.mask(mask)


def introduce_mar(data, missing_rate=0.2, random_state=42):
    """
    Introduce MAR (Missing At Random) values into a copy of ``data``.

    The first numeric column acts as the "driver": it is never masked, and
    every other cell of a row is masked with probability
    ``1.5 * missing_rate`` when the row's driver value is above the driver
    median and ``0.5 * missing_rate`` otherwise. One random cell is
    restored in any column that ends up fully masked. When ``data`` has no
    numeric column the function falls back to ``introduce_mcar``.

    Args:
        data: Complete DataFrame; it is not modified.
        missing_rate: Base missing probability (scaled by 0.5 / 1.5).
        random_state: Seed passed to ``np.random.seed``. This reseeds
            NumPy's global RNG as a side effect.

    Returns:
        A new DataFrame with the same shape, index and columns as ``data``
        in which the selected cells are NaN.
    """
    np.random.seed(random_state)
    n_rows, n_cols = data.shape

    numeric_cols = data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # No numeric column to condition on: fall back to MCAR.
        return introduce_mcar(data, missing_rate, random_state)

    # With zero rows there is nothing to mask, and randint(0) in the
    # column guarantee below would raise.
    if n_rows == 0:
        return data.copy()

    driver_col = numeric_cols[0]
    driver_values = data[driver_col].values
    driver_median = np.nanmedian(driver_values)

    # Per-row masking probability: higher when the driver exceeds its median.
    row_prob = np.where(
        driver_values > driver_median, missing_rate * 1.5, missing_rate * 0.5
    )

    # One uniform draw per (row, non-driver column), generated in row-major
    # order so the random stream matches a nested row/column loop.
    eligible = np.asarray(data.columns != driver_col)
    draws = np.random.random((n_rows, int(eligible.sum())))
    mask = np.zeros((n_rows, n_cols), dtype=bool)
    mask[:, eligible] = draws < row_prob[:, None]

    # The driver column is never masked, so no row can be fully masked;
    # only columns need the "keep at least one value" guarantee.
    for j in np.flatnonzero(mask.all(axis=0)):
        mask[np.random.randint(n_rows), j] = False

    # Blank all selected cells in one vectorised step.
    return data.mask(mask)


def introduce_mnar(data, missing_rate=0.2, random_state=42):
    """
    Introduce MNAR (Missing Not At Random) values into a copy of ``data``.

    The probability that a cell is masked depends on its own value:

    * numeric columns: values above the column median are masked with
      probability ``1.5 * missing_rate``, the rest with ``0.5 * missing_rate``;
    * other columns: cells equal to the most frequent category use
      ``1.5 * missing_rate``, the rest ``0.5 * missing_rate``.

    Afterwards one randomly chosen cell is restored in every fully masked
    row and in every fully masked column.

    Args:
        data: Complete DataFrame; it is not modified.
        missing_rate: Base missing probability (scaled by 0.5 / 1.5).
        random_state: Seed passed to ``np.random.seed``. This reseeds
            NumPy's global RNG as a side effect.

    Returns:
        A new DataFrame with the same shape, index and columns as ``data``
        in which the selected cells are NaN.
    """
    np.random.seed(random_state)
    n_rows, n_cols = data.shape

    # Nothing can be masked when an axis is empty, and randint(0) in the
    # "keep one value" steps below would raise.
    if n_rows == 0 or n_cols == 0:
        return data.copy()

    mask = np.zeros((n_rows, n_cols), dtype=bool)

    for j in range(n_cols):
        col_data = data.iloc[:, j]

        if pd.api.types.is_numeric_dtype(col_data):
            # High values are the ones more likely to go missing.
            more_likely = np.asarray(col_data > col_data.median(), dtype=bool)
        else:
            # The most frequent category is the one more likely to go missing.
            modes = col_data.mode()
            if len(modes) > 0:
                more_likely = np.asarray(col_data == modes.iloc[0], dtype=bool)
            else:
                more_likely = np.zeros(n_rows, dtype=bool)

        prob = np.where(more_likely, missing_rate * 1.5, missing_rate * 0.5)
        # One draw per row of this column, in row order.
        mask[:, j] = np.random.random(n_rows) < prob

    # Keep at least one observed value in every row ...
    for i in np.flatnonzero(mask.all(axis=1)):
        mask[i, np.random.randint(n_cols)] = False

    # ... and in every column (unmasking never re-empties a row).
    for j in np.flatnonzero(mask.all(axis=0)):
        mask[np.random.randint(n_rows), j] = False

    # Blank all selected cells in one vectorised step.
    return data.mask(mask)

In [None]:
def calculate_nrmse(true_values, imputed_values):
    """
    Compute the NRMSE (RMSE normalised by the range of the true values).

    Pairs in which either value is NaN are ignored.

    Args:
        true_values: Array-like of ground-truth numbers (list, Series or
            ndarray).
        imputed_values: Array-like of imputed numbers with the same length.

    Returns:
        ``rmse / (max(true) - min(true))`` over the valid pairs, or NaN when
        there is no valid pair or the true values have zero range.
    """
    # Coerce up front so lists and Series work, not only float ndarrays.
    true_arr = np.asarray(true_values, dtype=float)
    imputed_arr = np.asarray(imputed_values, dtype=float)

    valid = ~(np.isnan(true_arr) | np.isnan(imputed_arr))
    if not valid.any():
        return np.nan

    true_subset = true_arr[valid]
    imputed_subset = imputed_arr[valid]

    value_range = true_subset.max() - true_subset.min()
    if value_range == 0:
        # Normalising by a zero range is undefined.
        return np.nan

    rmse = np.sqrt(np.mean((true_subset - imputed_subset) ** 2))
    return rmse / value_range


def _categorical_accuracy(true_values, imputed_values):
    """Return the fraction of imputed categories that equal the true ones."""
    try:
        # Numeric-coded categories are compared as numbers so that 3 and 3.0
        # count as the same label.
        true_num = np.asarray(true_values, dtype=float)
        imputed_num = np.asarray(imputed_values, dtype=float)
    except (TypeError, ValueError):
        # Non-numeric labels: compare their string representations.
        matches = [str(t) == str(p) for t, p in zip(true_values, imputed_values)]
        return float(np.mean(matches))
    return float(np.mean(true_num == imputed_num))


def calculate_metrics_per_column(original_data, imputed_data, missing_mask, col_types):
    """
    Compute imputation metrics per column and return their means.

    Only the cells flagged in ``missing_mask`` are evaluated. Cells that are
    still missing after imputation are ignored, and a column is skipped when
    fewer than two imputed cells remain.

    Args:
        original_data: Original DataFrame (no missing values).
        imputed_data: Imputed DataFrame with the same columns and row count.
        missing_mask: Boolean DataFrame marking the cells that were blanked.
        col_types: Dict mapping column name to 'numeric' or 'categorical';
            columns absent from the dict are treated as numeric.

    Returns:
        Dict with the keys 'R2', 'Pearson', 'NRMSE' and 'Accuracy'. Each
        value is the mean over the columns that produced the metric, or NaN
        when no column did.
    """
    r2_scores = []
    pearson_scores = []
    nrmse_scores = []
    accuracy_scores = []

    for col in original_data.columns:
        col_mask = missing_mask[col].values
        if col_mask.sum() == 0:
            continue

        true_values = original_data.loc[col_mask, col].to_numpy()
        imputed_values = imputed_data.loc[col_mask, col].to_numpy()

        # Drop cells the imputer left empty. pd.isna works for any dtype,
        # including string categories that cannot be cast to float.
        valid_mask = ~pd.isna(imputed_values)
        if valid_mask.sum() < 2:
            continue

        true_subset = true_values[valid_mask]
        imputed_subset = imputed_values[valid_mask]

        if col_types.get(col, 'numeric') == 'numeric':
            true_float = true_subset.astype(float)
            imputed_float = imputed_subset.astype(float)
            true_varies = np.std(true_float) > 0

            # R² is only computed when the true values are not constant.
            if true_varies:
                try:
                    r2_scores.append(r2_score(true_float, imputed_float))
                except (ValueError, TypeError):
                    # Best effort: a column that cannot be scored is skipped.
                    pass

            # Pearson additionally needs variation in the imputed values.
            if true_varies and np.std(imputed_float) > 0:
                try:
                    corr, _ = pearsonr(true_float, imputed_float)
                except (ValueError, TypeError):
                    corr = np.nan
                if np.isfinite(corr):
                    pearson_scores.append(corr)

            nrmse = calculate_nrmse(true_float, imputed_float)
            if np.isfinite(nrmse):
                nrmse_scores.append(nrmse)
        else:
            accuracy_scores.append(_categorical_accuracy(true_subset, imputed_subset))

    return {
        'R2': np.mean(r2_scores) if r2_scores else np.nan,
        'Pearson': np.mean(pearson_scores) if pearson_scores else np.nan,
        'NRMSE': np.mean(nrmse_scores) if nrmse_scores else np.nan,
        'Accuracy': np.mean(accuracy_scores) if accuracy_scores else np.nan
    }

In [None]:
def impute_with_iscak(data_missing, verbose=False):
    """
    Impute ``data_missing`` with ISCA-k and time the call.

    Args:
        data_missing: DataFrame containing missing values.
        verbose: Forwarded to ``ISCAkCore``.

    Returns:
        Tuple ``(result, elapsed)``: whatever ``ISCAkCore.impute`` returns
        and the elapsed time of that call in seconds.
    """
    imputer = ISCAkCore(verbose=verbose, fast_mode=True)
    # perf_counter is a monotonic, high-resolution clock; time.time() can
    # jump when the system clock is adjusted.
    start = time.perf_counter()
    result = imputer.impute(data_missing, interactive=False)
    elapsed = time.perf_counter() - start
    return result, elapsed


def _encode_categoricals(data):
    """
    Label-encode the non-numeric columns of ``data`` for numeric-only imputers.

    Object, string and category columns are replaced by float columns that
    hold the label code of each observed value and NaN for missing cells.
    A column with no observed value becomes all-NaN and gets no encoder.

    Returns:
        Tuple ``(encoded, encoders)``: the encoded copy of ``data`` and a
        dict mapping each encoded column to its fitted ``LabelEncoder``.
    """
    encoded = data.copy()
    encoders = {}

    for col in encoded.columns:
        column = encoded[col]
        dtype = column.dtype
        needs_encoding = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not needs_encoding:
            continue

        observed = column.notna().to_numpy()
        # Codes are built in a fresh float array: writing integer codes back
        # into a category column would raise because they are not categories.
        codes = np.full(len(column), np.nan)
        if observed.any():
            le = LabelEncoder()
            codes[observed] = le.fit_transform(column.to_numpy()[observed])
            encoders[col] = le
        encoded[col] = codes

    return encoded, encoders


def _decode_categoricals(result, encoders):
    """Map imputed label codes in ``result`` back to the original categories (in place)."""
    for col, le in encoders.items():
        # Imputers return real-valued codes: round to the nearest label and
        # clamp to the valid code range before decoding.
        codes = result[col].round().astype(int).clip(0, len(le.classes_) - 1)
        result[col] = le.inverse_transform(codes)
    return result


def _load_missforest():
    """Return the first importable ``MissForest`` class, or None if none is installed."""
    import importlib

    # Tried in order: the missforest package top level, its inner module,
    # then the alternative missingpy package.
    for module_name in ("missforest", "missforest.missforest", "missingpy"):
        try:
            return getattr(importlib.import_module(module_name), "MissForest")
        except (ImportError, AttributeError):
            continue
    return None


def impute_with_knn(data_missing, n_neighbors=5):
    """
    Impute ``data_missing`` with sklearn's ``KNNImputer``.

    Non-numeric columns are label-encoded before imputation and decoded
    afterwards, because the imputer only handles numeric data.

    Args:
        data_missing: DataFrame containing missing values.
        n_neighbors: Number of neighbours used by the imputer.

    Returns:
        Tuple ``(result, elapsed)``: the imputed DataFrame and the time
        spent in ``fit_transform`` in seconds.
    """
    data_encoded, encoders = _encode_categoricals(data_missing)

    imputer = KNNImputer(n_neighbors=n_neighbors)
    start = time.perf_counter()
    imputed_array = imputer.fit_transform(data_encoded)
    elapsed = time.perf_counter() - start

    result = pd.DataFrame(imputed_array, columns=data_missing.columns, index=data_missing.index)
    return _decode_categoricals(result, encoders), elapsed


def impute_with_mice(data_missing, max_iter=10):
    """
    Impute ``data_missing`` with MICE (sklearn's ``IterativeImputer``).

    Non-numeric columns are label-encoded before imputation and decoded
    afterwards, because the imputer only handles numeric data.

    Args:
        data_missing: DataFrame containing missing values.
        max_iter: Maximum number of imputation rounds.

    Returns:
        Tuple ``(result, elapsed)``: the imputed DataFrame and the time
        spent in ``fit_transform`` in seconds.
    """
    data_encoded, encoders = _encode_categoricals(data_missing)

    imputer = IterativeImputer(max_iter=max_iter, random_state=42)
    start = time.perf_counter()
    imputed_array = imputer.fit_transform(data_encoded)
    elapsed = time.perf_counter() - start

    result = pd.DataFrame(imputed_array, columns=data_missing.columns, index=data_missing.index)
    return _decode_categoricals(result, encoders), elapsed


def impute_with_missforest(data_missing, max_iter=10):
    """
    Impute ``data_missing`` with MissForest, if an implementation is installed.

    Non-numeric columns are label-encoded before imputation and decoded
    afterwards.

    Args:
        data_missing: DataFrame containing missing values.
        max_iter: Passed to the ``MissForest`` constructor when it accepts
            that keyword; otherwise the constructor defaults are used.

    Returns:
        Tuple ``(result, elapsed)``: the imputed DataFrame and the time
        spent in ``fit_transform`` in seconds. When MissForest is
        unavailable or fails, a message is printed and an un-imputed copy
        of ``data_missing`` is returned together with NaN.
    """
    MissForest = _load_missforest()
    if MissForest is None:
        print("MissForest não disponível. Instalar: pip install missforest ou pip install missingpy")
        return data_missing.copy(), np.nan

    data_encoded, encoders = _encode_categoricals(data_missing)

    # Constructor signatures differ between implementations: retry without
    # max_iter when the keyword is rejected.
    try:
        mf = MissForest(max_iter=max_iter)
    except TypeError:
        try:
            mf = MissForest()
        except Exception as e:
            print(f"Erro ao criar MissForest: {e}")
            return data_missing.copy(), np.nan

    start = time.perf_counter()
    try:
        imputed_result = mf.fit_transform(data_encoded)
        elapsed = time.perf_counter() - start

        # fit_transform may return either a DataFrame or a plain array.
        if isinstance(imputed_result, pd.DataFrame):
            result = imputed_result
        else:
            result = pd.DataFrame(imputed_result, columns=data_missing.columns, index=data_missing.index)
    except Exception as e:
        print(f"Erro durante imputação MissForest: {e}")
        return data_missing.copy(), np.nan

    return _decode_categoricals(result, encoders), elapsed

In [None]:
def run_benchmark(data_original, dataset_name, col_types, missing_rates=(0.2, 0.3, 0.4, 0.5),
                  patterns=('MCAR', 'MAR', 'MNAR'), methods=('ISCA-k', 'KNN', 'MICE', 'MissForest')):
    """
    Run the full benchmark grid for one dataset.

    For every (pattern, rate) pair the missing values are introduced once
    and every method imputes that same masked data. A method that raises is
    reported with NaN metrics and NaN time instead of aborting the run.

    Args:
        data_original: Complete DataFrame used as ground truth.
        dataset_name: Label stored in the 'Dataset' column.
        col_types: Dict mapping column name to 'numeric' or 'categorical'.
        missing_rates: Iterable of target missing rates (fractions).
        patterns: Iterable of pattern names ('MCAR', 'MAR', 'MNAR').
        methods: Iterable of method names ('ISCA-k', 'KNN', 'MICE',
            'MissForest').

    Returns:
        DataFrame with one row per (pattern, rate, method) combination.
    """
    results = []

    pattern_funcs = {
        'MCAR': introduce_mcar,
        'MAR': introduce_mar,
        'MNAR': introduce_mnar
    }

    method_funcs = {
        'ISCA-k': impute_with_iscak,
        'KNN': impute_with_knn,
        'MICE': impute_with_mice,
        'MissForest': impute_with_missforest
    }

    # Metrics recorded for a run that failed.
    failed_metrics = dict.fromkeys(('R2', 'Pearson', 'NRMSE', 'Accuracy'), np.nan)

    total_runs = len(missing_rates) * len(patterns) * len(methods)
    current_run = 0

    for pattern in patterns:
        for rate in missing_rates:
            data_missing = pattern_funcs[pattern](data_original, rate)
            missing_mask = data_missing.isna()

            actual_rate = missing_mask.sum().sum() / data_missing.size
            # round() rather than int(): rate * 100 can land just below the
            # integer (0.29 * 100 == 28.999...), which int() would truncate.
            rate_label = f"{int(round(rate * 100))}%"

            for method in methods:
                current_run += 1
                print(f"  [{current_run}/{total_runs}] {pattern} {rate_label} - {method}...", end=" ")

                try:
                    # Each method gets its own copy so an imputer that
                    # modifies its input cannot affect the next method.
                    imputed_data, elapsed = method_funcs[method](data_missing.copy())
                    metrics = calculate_metrics_per_column(
                        data_original, imputed_data, missing_mask, col_types
                    )
                except Exception as e:
                    # Best effort: record the failure and continue the grid.
                    print(f"ERRO: {e}")
                    metrics, elapsed = failed_metrics, np.nan
                else:
                    print(f"OK ({elapsed:.2f}s)")

                results.append({
                    'Dataset': dataset_name,
                    'Pattern': pattern,
                    'Missing_Rate': rate_label,
                    'Actual_Rate': f"{actual_rate*100:.1f}%",
                    'Method': method,
                    'R2': metrics['R2'],
                    'Pearson': metrics['Pearson'],
                    'NRMSE': metrics['NRMSE'],
                    'Accuracy': metrics['Accuracy'],
                    'Time_s': elapsed
                })

    return pd.DataFrame(results)

## Carregar Datasets

In [None]:
# Every optional dataset below is loaded on a best-effort basis: on failure
# its variable is set to None and a message is printed. The handlers catch
# Exception (not a bare except) so Ctrl-C during a slow download still
# interrupts the cell.

# === IRIS (numeric) ===
iris_data = load_iris()
data_iris = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
col_types_iris = {col: 'numeric' for col in data_iris.columns}
print(f"Iris: {data_iris.shape}")

# === DIABETES (numeric) ===
diabetes_data = load_diabetes()
data_diabetes = pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names)
col_types_diabetes = {col: 'numeric' for col in data_diabetes.columns}
print(f"Diabetes: {data_diabetes.shape}")

# === WINE (assumes a local ';'-separated file named dataset.csv) ===
try:
    data_wine = pd.read_csv("dataset.csv", sep=";")
    col_types_wine = {col: 'numeric' for col in data_wine.columns}
    print(f"Wine: {data_wine.shape}")
except Exception:
    data_wine = None
    print("Wine: não encontrado")

# === SONAR (numeric) ===
try:
    from sklearn.datasets import fetch_openml
    sonar_data = fetch_openml('sonar', version=1, parser='auto', as_frame=True)
    # Keep only the numeric feature columns.
    data_sonar = pd.DataFrame(sonar_data.data).select_dtypes(include=[np.number])
    col_types_sonar = {col: 'numeric' for col in data_sonar.columns}
    print(f"Sonar: {data_sonar.shape}")
except Exception:
    data_sonar = None
    print("Sonar: erro ao carregar")

# === TITANIC (mixed) ===
try:
    url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    data_titanic_raw = pd.read_csv(url)
    # Select the useful columns and drop incomplete rows so the remaining
    # data can serve as ground truth.
    data_titanic = data_titanic_raw[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].dropna()
    col_types_titanic = {
        'Pclass': 'categorical', 'Sex': 'categorical', 'Age': 'numeric',
        'SibSp': 'numeric', 'Parch': 'numeric', 'Fare': 'numeric', 'Embarked': 'categorical'
    }
    print(f"Titanic: {data_titanic.shape}")
except Exception:
    data_titanic = None
    print("Titanic: erro ao carregar")

# === CREDIT APPROVAL (mixed) ===
try:
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
    column_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 
                    'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
    data_credit = pd.read_csv(url, header=None, names=column_names, na_values='?')
    data_credit = data_credit.dropna()  # Drop original NaNs to get a complete ground truth

    # Derive each column's type from its dtype.
    col_types_credit = {
        col: 'numeric' if pd.api.types.is_numeric_dtype(data_credit[col]) else 'categorical'
        for col in data_credit.columns
    }
    print(f"Credit: {data_credit.shape}")
except Exception:
    data_credit = None
    print("Credit: erro ao carregar")

## Executar Benchmarks

In [None]:
# Run the full benchmark grid on Iris and show the result table.
print("=" * 60, "BENCHMARK: IRIS", "=" * 60, sep="\n")
results_iris = run_benchmark(data_iris, 'Iris', col_types_iris)
display(results_iris)

In [None]:
# Run the full benchmark grid on Diabetes and show the result table.
print("=" * 60, "BENCHMARK: DIABETES", "=" * 60, sep="\n")
results_diabetes = run_benchmark(data_diabetes, 'Diabetes', col_types_diabetes)
display(results_diabetes)

In [None]:
# Benchmark Sonar when it was loaded; otherwise keep an empty result table
# so later cells can still reference results_sonar.
if data_sonar is None:
    results_sonar = pd.DataFrame()
else:
    print("=" * 60, "BENCHMARK: SONAR", "=" * 60, sep="\n")
    results_sonar = run_benchmark(data_sonar, 'Sonar', col_types_sonar)
    display(results_sonar)

In [None]:
# Benchmark Titanic (mixed types) when it was loaded; otherwise keep an
# empty result table so later cells can still reference results_titanic.
if data_titanic is None:
    results_titanic = pd.DataFrame()
else:
    print("=" * 60, "BENCHMARK: TITANIC (MISTO)", "=" * 60, sep="\n")
    results_titanic = run_benchmark(data_titanic, 'Titanic', col_types_titanic)
    display(results_titanic)

In [None]:
# Benchmark Credit Approval (mixed types) when it was loaded; otherwise keep
# an empty result table so later cells can still reference results_credit.
if data_credit is None:
    results_credit = pd.DataFrame()
else:
    print("=" * 60, "BENCHMARK: CREDIT (MISTO)", "=" * 60, sep="\n")
    results_credit = run_benchmark(data_credit, 'Credit', col_types_credit)
    display(results_credit)

## Resultados Consolidados

In [None]:
# Concatenate all result tables. Empty tables (datasets that failed to load)
# are filtered out before concatenation; if every table is empty, a single
# empty DataFrame is concatenated so the call still succeeds.
all_results = pd.concat(
    [
        frame
        for frame in (results_iris, results_diabetes, results_sonar,
                      results_titanic, results_credit)
        if len(frame) > 0
    ] or [pd.DataFrame()],
    ignore_index=True,
)

print(f"Total de experimentos: {len(all_results)}")
all_results.to_csv('benchmark_results.csv', index=False)
print("Resultados guardados em 'benchmark_results.csv'")

In [None]:
# Per-method summary: mean of every metric over all datasets, patterns and
# missing rates, rounded to 4 decimals.
print("\n" + "=" * 60, "RESUMO POR MÉTODO (MÉDIAS)", "=" * 60, sep="\n")

summary = (
    all_results
    .groupby('Method')[['R2', 'Pearson', 'NRMSE', 'Accuracy', 'Time_s']]
    .mean()
    .round(4)
)

display(summary)

In [None]:
# Summary per missingness pattern: mean metrics per method, one table for
# each pattern present in the results (in order of first appearance), so
# the loop follows whatever patterns the benchmark was actually run with.
print("\n" + "=" * 60)
print("RESUMO POR PADRÃO DE MISSINGNESS")
print("=" * 60)

for pattern in all_results['Pattern'].unique():
    print(f"\n{pattern}:")
    pattern_data = all_results[all_results['Pattern'] == pattern]
    summary_pattern = pattern_data.groupby('Method').agg({
        'R2': 'mean',
        'Pearson': 'mean',
        'NRMSE': 'mean',
        'Accuracy': 'mean',
        'Time_s': 'mean'
    }).round(4)
    display(summary_pattern)

In [None]:
# Summary per missing rate: mean metrics per method, one table for each
# rate label present in the results (in order of first appearance). The
# labels are read from the data rather than hard-coded, so the summary
# stays correct when the benchmark runs with other missing rates.
print("\n" + "=" * 60)
print("RESUMO POR TAXA DE MISSING")
print("=" * 60)

for rate in all_results['Missing_Rate'].unique():
    print(f"\n{rate}:")
    rate_data = all_results[all_results['Missing_Rate'] == rate]
    summary_rate = rate_data.groupby('Method').agg({
        'R2': 'mean',
        'Pearson': 'mean',
        'NRMSE': 'mean',
        'Accuracy': 'mean',
        'Time_s': 'mean'
    }).round(4)
    display(summary_rate)