# Teste de Scale Factors para PDS

Este notebook testa diferentes opções de scale factor para encontrar a configuração óptima.

**Hipótese a testar**: O problema pode ser over-correction (pesos MI re-normalizados + scale factor PDS)

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from copy import deepcopy

# Seed NumPy's global RNG so the randomly generated data in this notebook is
# reproducible from run to run.
np.random.seed(42)
print("Imports OK")

## 1. Funções de Distância com Scale Factors Configuráveis

In [None]:
def _pds_scale_factor(scale_mode, n_features, overlap):
    """Return the multiplier applied to a raw PDS distance for ``scale_mode``."""
    ratio = n_features / overlap
    if scale_mode == 'sqrt':
        return np.sqrt(ratio)
    if scale_mode == 'linear':
        return ratio
    if scale_mode == 'log':
        return 1 + np.log(ratio)
    if scale_mode == 'conditional':
        # Penalise only donors that share less than 70% of the features.
        return np.sqrt(ratio) if overlap / n_features < 0.7 else 1.0
    if scale_mode == 'inverse':
        # Shrinks the distance as overlap drops (factor is 1.0 at full overlap).
        return np.sqrt(overlap / n_features)
    # 'none' and any unrecognised mode leave the raw distance untouched.
    return 1.0


def pds_distance_configurable(sample, donors, weights, scale_mode='sqrt'):
    """
    Compute partial-distance (PDS) values between one sample and each donor.

    Only features observed (non-NaN) in both the sample and the donor
    contribute. The raw distance is sqrt(sum_j weights[j] * diff_j**2) over
    those shared features; it is then multiplied by a scale factor that
    depends on how many features are shared.

    Args:
        sample: 1-D numeric sequence; NaN marks a missing feature.
        donors: iterable of 1-D numeric sequences, each as long as ``sample``.
        weights: per-feature weights, as long as ``sample``. They are used as
            given (no re-normalisation over the shared features).
        scale_mode: scale factor applied to the raw distance, with
            n = number of features and overlap = number of shared features:
            - 'sqrt': sqrt(n/overlap)
            - 'none': 1.0
            - 'linear': n/overlap
            - 'log': 1 + log(n/overlap)
            - 'conditional': sqrt(n/overlap) only if overlap/n < 0.7, else 1.0
            - 'inverse': sqrt(overlap/n)
            Any other value silently behaves like 'none'.

    Returns:
        np.ndarray with one distance per donor. A donor gets np.inf when it
        shares fewer than max(2, n // 2) features with the sample, or when
        the weights of the shared features do not sum to a positive value.
    """
    sample = np.asarray(sample, dtype=float)
    weights = np.asarray(weights, dtype=float)
    n_features = len(sample)
    min_overlap = max(2, n_features // 2)
    sample_avail = ~np.isnan(sample)

    distances = []
    for donor in donors:
        donor = np.asarray(donor, dtype=float)
        overlap_mask = sample_avail & ~np.isnan(donor)
        overlap = int(overlap_mask.sum())

        if overlap < min_overlap:
            distances.append(np.inf)
            continue

        # Weighted squared differences restricted to the shared features.
        shared_weights = weights[overlap_mask]
        if shared_weights.sum() > 0:
            diff = sample[overlap_mask] - donor[overlap_mask]
            dist_raw = np.sqrt(np.sum(shared_weights * diff * diff))
        else:
            dist_raw = np.inf

        distances.append(dist_raw * _pds_scale_factor(scale_mode, n_features, overlap))

    return np.array(distances)

In [None]:
def impute_with_scale_mode(df_missing, df_complete, scale_mode='sqrt', k=5):
    """
    Impute missing cells using PDS distances with a configurable scale mode.

    Simplified version for experiments: every feature gets the same weight
    (1 / n_features) and no MI weighting is applied.

    For each missing cell, the donors are the rows where that column is
    observed. The imputed value is the inverse-distance weighted average of
    the column values of the ``k`` nearest donors; if any of those donors is
    at (numerically) zero distance, the plain mean of the zero-distance
    donors is used instead.

    Args:
        df_missing: numeric DataFrame with NaN in the cells to impute.
        df_complete: not used by this function; kept so the call signature
            stays stable for existing callers.
        scale_mode: forwarded to ``pds_distance_configurable``.
        k: maximum number of nearest donors to average.

    Returns:
        A copy of ``df_missing`` with imputed values written in. Cells stay
        NaN when the column has fewer than 2 donors or no donor has a finite
        distance to the row.
    """
    result = df_missing.copy()
    n_features = len(df_missing.columns)
    weights = np.ones(n_features) / n_features  # equal weights

    # Work positionally on one float matrix: this avoids repeated label
    # lookups and stays correct even if the index has duplicate labels.
    values = df_missing.to_numpy(dtype=float)
    is_missing = np.isnan(values)

    for col_pos in range(n_features):
        col_missing = is_missing[:, col_pos]
        if not col_missing.any():
            continue

        # Donors: rows where this column is observed.
        donor_rows = ~col_missing
        if donor_rows.sum() < 2:
            continue

        # The donor matrix is the same for every missing row of this column,
        # so build it once per column instead of once per cell.
        donors = values[donor_rows]
        donor_values = donors[:, col_pos]

        for row_pos in np.flatnonzero(col_missing):
            distances = pds_distance_configurable(
                values[row_pos], donors, weights, scale_mode
            )

            usable = np.isfinite(distances)
            if not usable.any():
                continue

            cand_distances = distances[usable]
            cand_values = donor_values[usable]

            # k nearest donors (or fewer when not enough are usable).
            nearest = np.argsort(cand_distances)[:min(k, len(cand_distances))]
            top_distances = cand_distances[nearest]
            top_values = cand_values[nearest]

            exact = top_distances < 1e-10
            if np.any(exact):
                # Exact matches dominate: avoid dividing by ~0.
                imputed = np.mean(top_values[exact])
            else:
                w = 1 / (top_distances + 1e-6)
                w = w / w.sum()
                imputed = np.average(top_values, weights=w)

            result.iloc[row_pos, col_pos] = imputed

    return result

## 2. Criar Datasets de Teste

In [None]:
def create_clustered_dataset(n_samples=100, n_features=10, n_clusters=3, noise=0.2):
    """
    Build a synthetic dataset made of well-separated Gaussian clusters.

    Cluster ``c`` has all its features centred on ``c * 3`` (std ``noise``)
    and its target centred on ``(c + 1) * 10`` (std 1.0). Values are drawn
    from NumPy's global RNG.

    Args:
        n_samples: total number of rows to generate.
        n_features: number of feature columns (named F0, F1, ...).
        n_clusters: number of clusters.
        noise: standard deviation of the features inside a cluster.

    Returns:
        DataFrame with ``n_samples`` rows and columns F0..F{n_features-1}
        plus 'Target'. When ``n_samples`` is not a multiple of
        ``n_clusters`` the first ``n_samples % n_clusters`` clusters receive
        one extra row, so no sample is dropped.
    """
    base_size, remainder = divmod(n_samples, n_clusters)
    data = []

    for c in range(n_clusters):
        # Spread the remainder over the first clusters so the total is exact.
        size = base_size + (1 if c < remainder else 0)
        center = c * 3  # feature centres at 0, 3, 6, ...
        target_center = (c + 1) * 10  # target centres at 10, 20, 30, ...

        features = np.random.normal(center, noise, (size, n_features))
        targets = np.random.normal(target_center, 1.0, size)

        data.append(np.column_stack([features, targets]))

    all_data = np.vstack(data)
    cols = [f'F{i}' for i in range(n_features)] + ['Target']
    return pd.DataFrame(all_data, columns=cols)

def create_correlated_dataset(n_samples=150, n_features=5):
    """
    Build a synthetic dataset with correlated features (a more realistic case).

    Columns, all drawn from NumPy's global RNG:
        F1: standard normal.
        F2: 0.8 * F1 plus noise (std 0.5) -- strongly correlated with F1.
        F3: standard normal, independent.
        F4: 0.3 * F1 + 0.3 * F3 plus noise (std 0.7) -- moderately correlated.
        Target: 2 * F1 + 1.5 * F2 + 0.5 * F4 plus noise (std 1).

    Args:
        n_samples: number of rows.
        n_features: accepted but not used; the frame always has the five
            columns listed above.

    Returns:
        DataFrame with columns F1, F2, F3, F4, Target.
    """
    def gaussian(std):
        # One zero-mean draw per sample from the global RNG.
        return np.random.normal(0, std, n_samples)

    # The order of the draws below fixes the generated values; keep it.
    f1 = gaussian(1)
    f2 = 0.8 * f1 + gaussian(0.5)
    f3 = gaussian(1)
    f4 = 0.3 * f1 + 0.3 * f3 + gaussian(0.7)
    target = 2 * f1 + 1.5 * f2 + 0.5 * f4 + gaussian(1)

    columns = {}
    columns['F1'] = f1
    columns['F2'] = f2
    columns['F3'] = f3
    columns['F4'] = f4
    columns['Target'] = target
    return pd.DataFrame(columns)

def introduce_missings(df, rate=0.2, seed=42):
    """
    Return a copy of ``df`` with cells blanked out completely at random (MCAR).

    Each cell is independently replaced by NaN when a uniform [0, 1) draw
    falls below ``rate``.

    Args:
        df: source DataFrame; it is not modified.
        rate: threshold compared against each uniform draw.
        seed: seed of the generator used for the mask.

    Returns:
        A new DataFrame of the same shape with NaN in the masked cells.
    """
    # A private RandomState yields the same stream as seeding the global RNG
    # with `seed`, but leaves the global RNG state untouched for other code.
    rng = np.random.RandomState(seed)
    mask = rng.random_sample(df.shape) < rate
    return df.mask(mask)

In [None]:
# Build the two benchmark datasets.
# The generators draw from NumPy's global RNG, so the clustered dataset is
# created first and the correlated one second; keep that order.
print("Criando datasets de teste...")

# Dataset 1: well-separated clusters
df_clustered = create_clustered_dataset(n_features=8, n_samples=150)
print("Dataset Clustered: {}".format(df_clustered.shape))

# Dataset 2: correlated features
df_correlated = create_correlated_dataset(n_samples=150)
print("Dataset Correlated: {}".format(df_correlated.shape))

## 3. Testar Scale Factors

In [None]:
def run_scale_test(df_complete, missing_rate, scale_modes, k=5, n_runs=3):
    """
    Benchmark several scale modes on one dataset.

    For each run, missing values are injected into ``df_complete`` (seed
    ``42 + run``), every mode in ``scale_modes`` imputes the same masked
    frame, and the mean absolute error (MAE) is measured on the cells that
    were missing and received an imputed value.

    Args:
        df_complete: numeric DataFrame holding the ground-truth values.
        missing_rate: rate passed to ``introduce_missings``.
        scale_modes: iterable of scale mode names to compare.
        k: number of neighbours passed to ``impute_with_scale_mode``.
        n_runs: number of masking repetitions.

    Returns:
        dict mapping each mode to its MAE averaged over the runs. The value
        is NaN when no run produced a score for that mode (or when a run
        scored NaN because nothing could be imputed).
    """
    results = {mode: [] for mode in scale_modes}
    truth = df_complete.to_numpy(dtype=float)

    for run in range(n_runs):
        # Different seed per run so each repetition masks different cells.
        df_missing = introduce_missings(df_complete, rate=missing_rate, seed=42 + run)

        missing_mask = df_missing.isna().to_numpy()
        if not missing_mask.any():
            continue

        for mode in scale_modes:
            imputed = impute_with_scale_mode(
                df_missing.copy(), df_complete, scale_mode=mode, k=k
            ).to_numpy(dtype=float)

            # Score only cells that were missing and actually got a value.
            scored = missing_mask & ~np.isnan(imputed)
            # Transposing keeps the errors in column-by-column order.
            errors = np.abs(imputed - truth).T[scored.T]

            mae = errors.mean() if errors.size else np.nan
            results[mode].append(mae)

    # Average over runs; a mode with no runs at all reports NaN.
    return {mode: (np.mean(maes) if maes else np.nan) for mode, maes in results.items()}

In [None]:
# Scale modes compared by the benchmarks below
scale_modes = [
    'sqrt',
    'none',
    'linear',
    'log',
    'conditional',
    'inverse',
]

# Three-line banner, emitted through a single print call
print("=" * 70, "TESTE DE SCALE FACTORS", "=" * 70, sep="\n")
print("\nModos a testar: {}".format(scale_modes))

In [None]:
# Benchmark on the CLUSTERED dataset
print("\n" + "="*70)
print("DATASET: CLUSTERED (clusters bem separados)")
print("="*70)

clustered_results = {}
for rate in [0.10, 0.20, 0.30]:
    print(f"\nTaxa de missing: {rate*100:.0f}%")
    results = run_scale_test(df_clustered, rate, scale_modes, k=5, n_runs=3)
    clustered_results[rate] = results

    # Rank by MAE. NaN scores sort last through an infinite key, so they can
    # never outrank a real MAE, however large.
    sorted_results = sorted(results.items(), key=lambda x: x[1] if pd.notna(x[1]) else np.inf)
    # Only flag a winner when the best entry has a real score.
    has_winner = bool(sorted_results) and pd.notna(sorted_results[0][1])
    for mode, mae in sorted_results:
        marker = "★" if has_winner and mode == sorted_results[0][0] else " "
        print(f"  {marker} {mode:<12}: MAE = {mae:.4f}")

In [None]:
# Benchmark on the CORRELATED dataset
print("\n" + "="*70)
print("DATASET: CORRELATED (features correlacionadas)")
print("="*70)

correlated_results = {}
for rate in [0.10, 0.20, 0.30]:
    print(f"\nTaxa de missing: {rate*100:.0f}%")
    results = run_scale_test(df_correlated, rate, scale_modes, k=5, n_runs=3)
    correlated_results[rate] = results

    # Rank by MAE. NaN scores sort last through an infinite key, so they can
    # never outrank a real MAE, however large.
    sorted_results = sorted(results.items(), key=lambda x: x[1] if pd.notna(x[1]) else np.inf)
    # Only flag a winner when the best entry has a real score.
    has_winner = bool(sorted_results) and pd.notna(sorted_results[0][1])
    for mode, mae in sorted_results:
        marker = "★" if has_winner and mode == sorted_results[0][0] else " "
        print(f"  {marker} {mode:<12}: MAE = {mae:.4f}")

## 4. Análise Comparativa

In [None]:
print("\n" + "="*70)
print("ANÁLISE COMPARATIVA")
print("="*70)

# Summary tables: one row per scale mode, one column per missing rate plus
# the NaN-aware mean across rates. Both datasets share the same layout.
for heading, per_rate_results in (
    ("\n--- DATASET CLUSTERED ---", clustered_results),
    ("\n--- DATASET CORRELATED ---", correlated_results),
):
    print(heading)
    print(f"{'Mode':<12} {'10%':<10} {'20%':<10} {'30%':<10} {'Média':<10}")
    print("-" * 52)
    for mode in scale_modes:
        vals = []
        for r in [0.10, 0.20, 0.30]:
            # A mode that is absent for a rate shows up as NaN.
            vals.append(per_rate_results[r].get(mode, np.nan))
        mean_val = np.nanmean(vals)
        print(f"{mode:<12} {vals[0]:<10.4f} {vals[1]:<10.4f} {vals[2]:<10.4f} {mean_val:<10.4f}")

In [None]:
# Find the best scale mode for each dataset and overall
print("\n" + "="*70)
print("MELHOR SCALE MODE POR DATASET")
print("="*70)

# Clustered: NaN-aware mean MAE per mode across the three missing rates
clustered_means = {}
for mode in scale_modes:
    per_rate = [clustered_results[r].get(mode, np.nan) for r in [0.10, 0.20, 0.30]]
    clustered_means[mode] = np.nanmean(per_rate)
winner = min(clustered_means, key=clustered_means.get)
best_clustered = (winner, clustered_means[winner])
print(f"\nCLUSTERED: Melhor = '{best_clustered[0]}' (MAE médio = {best_clustered[1]:.4f})")

# Correlated: same aggregation
correlated_means = {}
for mode in scale_modes:
    per_rate = [correlated_results[r].get(mode, np.nan) for r in [0.10, 0.20, 0.30]]
    correlated_means[mode] = np.nanmean(per_rate)
winner = min(correlated_means, key=correlated_means.get)
best_correlated = (winner, correlated_means[winner])
print(f"CORRELATED: Melhor = '{best_correlated[0]}' (MAE médio = {best_correlated[1]:.4f})")

# Overall: plain average of the two per-dataset means
overall_means = {}
for mode in scale_modes:
    overall_means[mode] = (clustered_means[mode] + correlated_means[mode]) / 2
winner = min(overall_means, key=overall_means.get)
best_overall = (winner, overall_means[winner])
print(f"\nGERAL: Melhor = '{best_overall[0]}' (MAE médio = {best_overall[1]:.4f})")

## 5. Teste da Hipótese de Over-Correction

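Testar se a combinação de pesos MI re-normalizados + scale factor causa over-correction.

**Nota:** nesta secção os pesos são iguais (1/n por feature), não pesos MI. Com pesos iguais, `norenorm+scale` e `renorm+noscale` produzem a mesma distância, $\sqrt{\sum_j d_j^2 / \text{overlap}}$ (a menos de erros de arredondamento), pelo que se espera um MAE praticamente idêntico para estas duas combinações.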

In [None]:
def pds_distance_with_options(sample, donors, weights,
                               scale_mode='sqrt',
                               renormalize_weights=True):
    """
    Compute PDS distances with switches for studying over-correction.

    Only features observed (non-NaN) in both the sample and the donor
    contribute to the distance sqrt(sum_j w_j * diff_j**2).

    Args:
        sample: 1-D numeric sequence; NaN marks a missing feature.
        donors: iterable of 1-D numeric sequences, each as long as ``sample``.
        weights: per-feature weights, as long as ``sample``.
        scale_mode: 'sqrt' multiplies the distance by sqrt(n / overlap),
            where n is the number of features and overlap the number of
            shared features. Any other value (including 'none') applies no
            scaling.
        renormalize_weights: if True, the weights of the shared features are
            rescaled to sum to 1 (equal weights are used when they sum to a
            non-positive value). If False, the original weights are used
            unchanged.

    Returns:
        np.ndarray with one distance per donor; np.inf for donors sharing
        fewer than max(2, n // 2) features with the sample.
    """
    sample = np.asarray(sample, dtype=float)
    weights = np.asarray(weights, dtype=float)
    n_features = len(sample)
    min_overlap = max(2, n_features // 2)
    sample_avail = ~np.isnan(sample)

    distances = []
    for donor in donors:
        donor = np.asarray(donor, dtype=float)
        overlap_mask = sample_avail & ~np.isnan(donor)
        overlap = int(overlap_mask.sum())

        if overlap < min_overlap:
            distances.append(np.inf)
            continue

        # Weights of the shared features, optionally rescaled to sum to 1.
        available_weights = weights[overlap_mask]
        if renormalize_weights:
            total = available_weights.sum()
            if total > 0:
                available_weights = available_weights / total
            else:
                available_weights = np.full(overlap, 1.0 / overlap)

        diff = sample[overlap_mask] - donor[overlap_mask]
        dist_raw = np.sqrt(np.sum(available_weights * diff * diff))

        # Only 'sqrt' scales; every other mode leaves the distance as is.
        scale = np.sqrt(n_features / overlap) if scale_mode == 'sqrt' else 1.0

        distances.append(dist_raw * scale)

    return np.array(distances)

In [None]:
def test_over_correction(df_complete, missing_rate=0.2, k=5, n_runs=3):
    """
    Compare weight re-normalisation and scale factor in all four combinations.

    Combinations evaluated:
        1. 'renorm+scale':     renormalize_weights=True,  scale_mode='sqrt'
        2. 'renorm+noscale':   renormalize_weights=True,  scale_mode='none'
        3. 'norenorm+scale':   renormalize_weights=False, scale_mode='sqrt'
        4. 'norenorm+noscale': renormalize_weights=False, scale_mode='none'

    For each combination and run, missing values are injected (seed
    ``42 + run``), each missing cell is imputed from the rows where its
    column is observed (inverse-distance weighted average of the ``k``
    nearest donors, equal feature weights), and the MAE is measured on the
    imputed cells.

    Args:
        df_complete: numeric DataFrame holding the ground-truth values.
        missing_rate: rate passed to ``introduce_missings``.
        k: maximum number of nearest donors to average.
        n_runs: number of masking repetitions per combination.

    Returns:
        dict mapping the combination name to its MAE averaged over the runs
        (NaN when no run imputed anything).
    """
    combinations = [
        ('renorm+scale', True, 'sqrt'),
        ('renorm+noscale', True, 'none'),
        ('norenorm+scale', False, 'sqrt'),
        ('norenorm+noscale', False, 'none'),
    ]

    truth = df_complete.to_numpy(dtype=float)
    n_features = truth.shape[1]
    weights = np.ones(n_features) / n_features  # equal weights

    results = {}

    for name, renorm, scale in combinations:
        maes = []

        for run in range(n_runs):
            df_missing = introduce_missings(df_complete, rate=missing_rate, seed=42 + run)

            # Positional float matrices: `values` is the masked input that
            # donors are read from, `imputed` receives the estimates.
            values = df_missing.to_numpy(dtype=float)
            imputed = values.copy()
            is_missing = np.isnan(values)

            for col_pos in range(n_features):
                col_missing = is_missing[:, col_pos]
                if not col_missing.any():
                    continue

                donor_rows = ~col_missing
                if donor_rows.sum() < 2:
                    continue

                # Same donor matrix for every missing row of this column.
                donors = values[donor_rows]
                donor_values = donors[:, col_pos]

                for row_pos in np.flatnonzero(col_missing):
                    distances = pds_distance_with_options(
                        values[row_pos], donors, weights,
                        scale_mode=scale,
                        renormalize_weights=renorm
                    )

                    usable = np.isfinite(distances)
                    if not usable.any():
                        continue

                    cand_distances = distances[usable]
                    cand_values = donor_values[usable]

                    nearest = np.argsort(cand_distances)[:min(k, len(cand_distances))]
                    top_distances = cand_distances[nearest]
                    top_values = cand_values[nearest]

                    exact = top_distances < 1e-10
                    if np.any(exact):
                        # Exact matches dominate: avoid dividing by ~0.
                        estimate = np.mean(top_values[exact])
                    else:
                        w = 1 / (top_distances + 1e-6)
                        w = w / w.sum()
                        estimate = np.average(top_values, weights=w)

                    imputed[row_pos, col_pos] = estimate

            # MAE over cells that were missing and received an estimate;
            # transposing keeps the errors in column-by-column order.
            filled = is_missing & ~np.isnan(imputed)
            errors = np.abs(imputed - truth).T[filled.T]

            if errors.size:
                maes.append(errors.mean())

        results[name] = np.mean(maes) if maes else np.nan

    return results

In [None]:
print("\n" + "="*70)
print("TESTE DE OVER-CORRECTION")
print("="*70)
print("\nTestando combinações de re-normalização de pesos + scale factor...")

# Clustered dataset: list the combinations from lowest to highest MAE and
# star every entry that ties with the minimum.
print("\n--- DATASET CLUSTERED ---")
oc_clustered = test_over_correction(df_clustered, missing_rate=0.2)
lowest_mae = min(oc_clustered.values())
for name, mae in sorted(oc_clustered.items(), key=lambda item: item[1]):
    marker = " "
    if mae == lowest_mae:
        marker = "★"
    print(f"  {marker} {name:<20}: MAE = {mae:.4f}")

# Correlated dataset: same report.
print("\n--- DATASET CORRELATED ---")
oc_correlated = test_over_correction(df_correlated, missing_rate=0.2)
lowest_mae = min(oc_correlated.values())
for name, mae in sorted(oc_correlated.items(), key=lambda item: item[1]):
    marker = " "
    if mae == lowest_mae:
        marker = "★"
    print(f"  {marker} {name:<20}: MAE = {mae:.4f}")

In [None]:
print("\n" + "="*70)
print("CONCLUSÕES DO TESTE DE OVER-CORRECTION")
print("="*70)

# Compare the current configuration ('renorm+scale') with the best-scoring
# combination on each dataset.
actual_clustered = oc_clustered['renorm+scale']
best_clustered = min(oc_clustered.values())
best_name_clustered = [
    name for name, score in oc_clustered.items() if score == best_clustered
][0]

actual_correlated = oc_correlated['renorm+scale']
best_correlated = min(oc_correlated.values())
best_name_correlated = [
    name for name, score in oc_correlated.items() if score == best_correlated
][0]

print("\nCLUSTERED:")
print(f"  Actual (renorm+scale): {actual_clustered:.4f}")
print(f"  Melhor ({best_name_clustered}): {best_clustered:.4f}")
if actual_clustered > best_clustered:
    # Relative MAE reduction, in percent, of the best combination.
    improvement = (actual_clustered - best_clustered) / actual_clustered * 100
    print(f"  → Melhoria potencial: {improvement:.1f}%")

print("\nCORRELATED:")
print(f"  Actual (renorm+scale): {actual_correlated:.4f}")
print(f"  Melhor ({best_name_correlated}): {best_correlated:.4f}")
if actual_correlated > best_correlated:
    # Relative MAE reduction, in percent, of the best combination.
    improvement = (actual_correlated - best_correlated) / actual_correlated * 100
    print(f"  → Melhoria potencial: {improvement:.1f}%")

## 6. Resumo Final

In [None]:
# Closing banner (one print call, newline-separated) followed by the summary
print("\n" + "=" * 70, "RESUMO FINAL", "=" * 70, sep="\n")
print("""
Este notebook testou diferentes configurações de scale factor para PDS.

VARIÁVEIS TESTADAS:
1. Scale factor: sqrt, none, linear, log, conditional, inverse
2. Re-normalização de pesos: com vs sem

DATASETS:
- Clustered: clusters bem separados (caso fácil)
- Correlated: features correlacionadas (caso realista)

ANÁLISE:
- Ver qual scale factor funciona melhor em cada dataset
- Ver se há over-correction (renorm + scale)
- Identificar configuração óptima
""")