In [1]:
import os
import sys
import numpy as np

# Make the parent directory and the current directory importable (in that order)
sys.path.extend(os.path.abspath(caminho) for caminho in ("..", "."))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from scipy.stats import ttest_rel, wilcoxon, friedmanchisquare


In [2]:
import pickle
from sklearn.datasets import load_breast_cancer

# Load the previously saved encoder object from disk.
# NOTE(review): unpickling can execute arbitrary code — only load model files
# that come from a trusted source.
with open("../models/encoder.pkl", "rb") as f:
    encoder = pickle.Unpickler(f).load()

# Load the WDBC (breast cancer) dataset bundled with scikit-learn
data = load_breast_cancer()
X, y = data.data, data.target

print("Base carregada:", X.shape)
print("Encoder carregado!")




Base carregada: (569, 30)
Encoder carregado!


In [3]:
def avaliar_repetidamente(modelo, X, y, reps=30, pca=None, ae=None):
    """Collect a model's test accuracy over repeated stratified hold-out splits.

    For every seed in ``range(reps)`` the data is split 80/20 (stratified on
    ``y``), standardised with a scaler fitted on the training part only,
    optionally transformed with ``pca`` or ``ae``, and ``modelo`` is refitted
    on the training part and scored on the test part.

    Parameters
    ----------
    modelo : object
        Must expose ``fit(X, y)`` and ``score(X, y)``. It is refitted in place
        on every repetition. If it also exposes ``get_params``/``set_params``
        and its ``random_state`` parameter is ``None``, the repetition seed is
        used as ``random_state`` for each fit (and the parameter is reset to
        ``None`` afterwards) so that re-running yields the same accuracies.
        An explicitly configured ``random_state`` is left untouched.
    X, y : array-like
        Features and labels, forwarded to ``train_test_split``.
    reps : int, default 30
        Number of splits; split ``i`` uses ``random_state=i``.
    pca : object or None
        If given, ``fit_transform`` is called on the scaled training data and
        ``transform`` on the scaled test data at every repetition. Takes
        precedence over ``ae`` when both are supplied.
    ae : object or None
        If given (and ``pca`` is None), only its ``predict`` method is called
        on the scaled data; it is never fitted here.

    Returns
    -------
    numpy.ndarray
        One accuracy value per repetition, in seed order.
    """
    # Only take control of the model's seed when the caller left it unset;
    # objects without the scikit-learn parameter API are used as-is.
    parametros = modelo.get_params(deep=False) if hasattr(modelo, "get_params") else {}
    semear_modelo = "random_state" in parametros and parametros["random_state"] is None

    resultados = []

    try:
        for seed in range(reps):
            # Random stratified split, reproducible through the seed
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=seed, stratify=y
            )

            # Scale using statistics from the training part only
            scaler = StandardScaler()
            X_train_s = scaler.fit_transform(X_train)
            X_test_s = scaler.transform(X_test)

            if pca is not None:
                # PCA is refitted on each training split
                Z_train = pca.fit_transform(X_train_s)
                Z_test = pca.transform(X_test_s)
            elif ae is not None:
                # The encoder is applied as-is, without refitting
                Z_train = ae.predict(X_train_s)
                Z_test = ae.predict(X_test_s)
            else:
                Z_train = X_train_s
                Z_test = X_test_s

            if semear_modelo:
                # Tie the model's randomness to the split seed for reproducibility
                modelo.set_params(random_state=seed)

            # Train and measure accuracy
            modelo.fit(Z_train, y_train)
            resultados.append(modelo.score(Z_test, y_test))
    finally:
        if semear_modelo:
            # Hand the model back with the configuration the caller gave it
            modelo.set_params(random_state=None)

    return np.array(resultados)


In [4]:
# Experiment configuration (kept in one place so every run uses the same values)
SEED = 42               # makes the random forest's results repeatable across runs
N_REPS = 30             # number of repeated hold-out splits per configuration
N_COMPONENTES_PCA = 7   # fixed number of principal components

# Create the model instances
rf  = RandomForestClassifier(random_state=SEED)
svm = SVC()

# PCA with a fixed number of components
pca = PCA(n_components=N_COMPONENTES_PCA)

print("Gerando distribuições de acurácia...")

# Random forest: original features, PCA projection, encoder output
acc_rf_o   = avaliar_repetidamente(rf,  X, y, reps=N_REPS)
acc_rf_pca = avaliar_repetidamente(rf,  X, y, reps=N_REPS, pca=pca)
acc_rf_ae  = avaliar_repetidamente(rf,  X, y, reps=N_REPS, ae=encoder)

# SVM: same three feature representations
acc_svm_o   = avaliar_repetidamente(svm, X, y, reps=N_REPS)
acc_svm_pca = avaliar_repetidamente(svm, X, y, reps=N_REPS, pca=pca)
acc_svm_ae  = avaliar_repetidamente(svm, X, y, reps=N_REPS, ae=encoder)

print("Finalizado!")


Gerando distribuições de acurácia...
Finalizado!


In [5]:
# Preview the first five accuracies of each random-forest configuration
acc_rf_o[:5], acc_rf_pca[:5], acc_rf_ae[:5]


(array([0.94736842, 0.96491228, 0.98245614, 0.97368421, 0.99122807]),
 array([0.92982456, 0.92982456, 0.96491228, 0.95614035, 0.96491228]),
 array([0.9122807 , 0.89473684, 0.92982456, 0.93859649, 0.95614035]))

In [6]:
# Pairs to compare: for each classifier, original features vs. each reduced
# representation (PCA, then autoencoder).
# NOTE(review): the tests below treat the paired accuracies as independent
# samples — confirm that assumption holds for how the acc_* arrays were produced.
pares = (
    (acc_rf_o, acc_rf_pca),
    (acc_rf_o, acc_rf_ae),
    (acc_svm_o, acc_svm_pca),
    (acc_svm_o, acc_svm_ae),
)

# Paired t-test
t_rf_o_pca, t_rf_o_ae, t_svm_o_pca, t_svm_o_ae = [
    ttest_rel(original, reduzido) for original, reduzido in pares
]

# Wilcoxon signed-rank test (non-parametric)
w_rf_o_pca, w_rf_o_ae, w_svm_o_pca, w_svm_o_ae = [
    wilcoxon(original, reduzido) for original, reduzido in pares
]

# Friedman test across all six configurations
friedman = friedmanchisquare(
    acc_rf_o, acc_rf_pca, acc_rf_ae,
    acc_svm_o, acc_svm_pca, acc_svm_ae,
)

t_rf_o_pca, w_rf_o_pca, friedman




(TtestResult(statistic=2.6567968253329703, pvalue=0.012691458819644648, df=29),
 WilcoxonResult(statistic=83.0, pvalue=0.018369958070923717),
 FriedmanchisquareResult(statistic=75.67415730337086, pvalue=6.728313753079813e-15))

In [7]:
# One line per comparison: label, paired t-test p-value, Wilcoxon p-value
for rotulo, teste_t, teste_w in (
    ("RF_O vs RF_PCA:", t_rf_o_pca, w_rf_o_pca),
    ("RF_O vs RF_AE :", t_rf_o_ae, w_rf_o_ae),
    ("SVM_O vs SVM_PCA:", t_svm_o_pca, w_svm_o_pca),
    ("SVM_O vs SVM_AE :", t_svm_o_ae, w_svm_o_ae),
):
    print(rotulo, teste_t.pvalue, teste_w.pvalue)


RF_O vs RF_PCA: 0.012691458819644648 0.018369958070923717
RF_O vs RF_AE : 1.5138988300045264e-07 1.5596713509175497e-05
SVM_O vs SVM_PCA: 8.9602360522025e-05 0.0006888879718906282
SVM_O vs SVM_AE : 6.344106314118027e-10 3.1808946484098983e-06


In [None]:
import pickle
# Make sure the output directory exists before writing
os.makedirs("../models", exist_ok=True)

# Bundle the six accuracy arrays under their variable names
resultados = dict(
    acc_rf_o=acc_rf_o,
    acc_rf_pca=acc_rf_pca,
    acc_rf_ae=acc_rf_ae,
    acc_svm_o=acc_svm_o,
    acc_svm_pca=acc_svm_pca,
    acc_svm_ae=acc_svm_ae,
)

# Persist the bundle as a pickle file
with open("../models/acc_results.pkl", "wb") as f:
    pickle.dump(resultados, f)

print("Distribuições salvas com sucesso!")


Distribuições salvas com sucesso!
