# Comparación de técnicas de imputación de valores perdidos

## Importaciones y carga del dataset

In [217]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import time
import numpy as np
#from sklearn.base import accuracy_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.impute import SimpleImputer

In [218]:
RANDOM_STATE = 0

def simulate_missingness(X, missing_rate=0.05, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    X2 = X.astype(float).copy()
    n, d = X2.shape
    m = int(missing_rate * n * d)
    idx = rng.choice(n*d, m, replace=False)
    row_indices = idx // d
    col_indices = idx % d
    X2.values[row_indices, col_indices] = np.nan
    return X2

# 1. Load the "Bunch" object
data = load_breast_cancer()

df_features = pd.DataFrame(data.data, columns=data.feature_names); df_target = pd.Series(data.target, name='target')
df = pd.concat([df_features, df_target], axis=1)

X = df.drop('target', axis=1)
y = df['target']
print('X shape:', X.shape, '| y shape:', y.shape)

X shape: (569, 30) | y shape: (569,)


## Base

In [219]:
Xtr_base, Xte_base, ytr_base, yte_base = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print('Train:', Xtr_base.shape, '| Test:', Xte_base.shape)

Train: (426, 30) | Test: (143, 30)


In [220]:
model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_base, ytr_base);

#Finaliza la medición del tiempo
t_base = time.perf_counter() - t0

yp = model.predict(Xte_base)
acc_base = accuracy_score(yte_base, yp)
f1_base = f1_score(yte_base, yp, average='macro')

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9371  |  F1-macro: 0.9328  |  tiempo: 0.008s


In [221]:
model = RandomForestClassifier(n_estimators=6, random_state=RANDOM_STATE, n_jobs=-1)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_base, ytr_base);

#Finaliza la medición del tiempo
t_base = time.perf_counter() - t0

yp = model.predict(Xte_base)
acc_base = accuracy_score(yte_base, yp)
f1_base = f1_score(yte_base, yp, average='macro')

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9580  |  F1-macro: 0.9554  |  tiempo: 0.018s


## Ahora simulamos que se pierden los valores 

In [222]:
X = simulate_missingness(X)
print("\n\\-----------------------------Visualización de Valores perdidos-----------------------------/")
has_missing = False
for columna in X.columns:
    perdidos = X[columna].isnull().sum() / len(X[columna]) * 100
    if perdidos != 0:
        print(f"La columna {columna} tiene {perdidos:.2f}% valores perdidos")
        has_missing = True

if not has_missing:
    print("No se encontraron valores perdidos. (Esto es inesperado)")


\-----------------------------Visualización de Valores perdidos-----------------------------/
La columna mean radius tiene 7.21% valores perdidos
La columna mean texture tiene 4.04% valores perdidos
La columna mean perimeter tiene 5.62% valores perdidos
La columna mean area tiene 5.27% valores perdidos
La columna mean smoothness tiene 5.62% valores perdidos
La columna mean compactness tiene 5.80% valores perdidos
La columna mean concavity tiene 4.92% valores perdidos
La columna mean concave points tiene 3.16% valores perdidos
La columna mean symmetry tiene 6.15% valores perdidos
La columna mean fractal dimension tiene 3.69% valores perdidos
La columna radius error tiene 4.57% valores perdidos
La columna texture error tiene 4.57% valores perdidos
La columna perimeter error tiene 5.62% valores perdidos
La columna area error tiene 6.33% valores perdidos
La columna smoothness error tiene 6.33% valores perdidos
La columna compactness error tiene 4.04% valores perdidos
La columna concavity 

## Imputación básica

### Eliminación de filas con NaN

In [223]:
X_elim = X.copy()
y_elim = y.copy()

In [224]:
Xtr_elim, Xte_elim, ytr_elim, yte_elim = train_test_split(
    X_elim, y_elim, test_size=0.25, stratify=y_elim, random_state=RANDOM_STATE
)
print('Train:', Xtr_base.shape, '| Test:', Xte_base.shape)

Train: (426, 30) | Test: (143, 30)


In [225]:
Xtr_elim_imp = Xtr_elim.dropna()
ytr_elim_imp = ytr_elim.loc[Xtr_elim_imp.index]

Xte_elim_imp = Xte_elim.dropna()
yte_elim_imp = yte_elim.loc[Xte_elim_imp.index]
print('X shape:', Xtr_elim_imp.shape, '| y shape:', ytr_elim_imp.shape)
print('X shape:', Xte_elim_imp.shape, '| y shape:', yte_elim_imp.shape)

X shape: (99, 30) | y shape: (99,)
X shape: (24, 30) | y shape: (24,)


In [226]:
model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_elim_imp, ytr_elim_imp);

#Finaliza la medición del tiempo
t_elim = time.perf_counter() - t0

yp = model.predict(Xte_elim_imp)
acc_elim = accuracy_score(yte_elim_imp, yp)
f1_elim = f1_score(yte_elim_imp, yp, average='macro')

print('ELIMINACIÓN')
print(f'Accuracy: {acc_elim:.4f}  |  F1-macro: {f1_elim:.4f}  |  tiempo: {t_elim:.3f}s')

ELIMINACIÓN
Accuracy: 0.9167  |  F1-macro: 0.9143  |  tiempo: 0.001s


In [227]:
model = RandomForestClassifier(n_estimators=6, random_state=RANDOM_STATE, n_jobs=-1)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_elim_imp, ytr_elim_imp);

#Finaliza la medición del tiempo
t_elim = time.perf_counter() - t0

yp = model.predict(Xte_elim_imp)
acc_elim = accuracy_score(yte_elim_imp, yp)
f1_elim = f1_score(yte_elim_imp, yp, average='macro')

print('ELIMINACIÓN')
print(f'Accuracy: {acc_elim:.4f}  |  F1-macro: {f1_elim:.4f}  |  tiempo: {t_elim:.3f}s')

ELIMINACIÓN
Accuracy: 0.9583  |  F1-macro: 0.9577  |  tiempo: 0.014s


### Imputación estadística básica

Media

In [228]:
X_media = X.copy()
y_media = y.copy()

In [229]:
Xtr_media, Xte_media, ytr_media, yte_media = train_test_split(
    X_media, y_media, test_size=0.25, stratify=y_media, random_state=RANDOM_STATE
)

In [230]:
imp = SimpleImputer(strategy="mean")
Xtr_m_imp = imp.fit_transform(Xtr_media)
Xte_m_imp = imp.transform(Xte_media)

In [231]:
model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_m_imp, ytr_media);

#Finaliza la medición del tiempo
t_media = time.perf_counter() - t0

yp = model.predict(Xte_m_imp)
acc_media = accuracy_score(yte_media, yp)
f1_media = f1_score(yte_media, yp, average='macro')

print('IMPUTACIÓN MEDIA')
print(f'Accuracy: {acc_media:.4f}  |  F1-macro: {f1_media:.4f}  |  tiempo: {t_media:.4f}s')

IMPUTACIÓN MEDIA
Accuracy: 0.9510  |  F1-macro: 0.9469  |  tiempo: 0.0030s


In [232]:
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_m_imp, ytr_media);

#Finaliza la medición del tiempo
t_basicaR = time.perf_counter() - t0

yp = model.predict(Xte_m_imp)
acc_media = accuracy_score(yte_media, yp)
f1_media = f1_score(yte_media, yp, average='macro')

print('IMPUTACIÓN MEDIA')
print(f'Accuracy: {acc_media:.4f}  |  F1-macro: {f1_media:.4f}  |  tiempo: {t_media:.4f}s')

IMPUTACIÓN MEDIA
Accuracy: 0.9371  |  F1-macro: 0.9323  |  tiempo: 0.0030s


Mediana

In [233]:
X_mediana = X.copy()
y_mediana = y.copy()

In [234]:
Xtr_mediana, Xte_mediana, ytr_mediana, yte_mediana = train_test_split(
    X_mediana, y_mediana, test_size=0.25, stratify=y_mediana, random_state=RANDOM_STATE
)

In [235]:
imp = SimpleImputer(strategy="median")
Xtr_mm_imp = imp.fit_transform(Xtr_mediana)
Xte_mm_imp = imp.transform(Xte_mediana)

In [236]:
model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_mm_imp, ytr_mediana);

#Finaliza la medición del tiempo
t_mediana = time.perf_counter() - t0

yp = model.predict(Xte_mm_imp)
acc_mediana = accuracy_score(yte_mediana, yp)
f1_mediana = f1_score(yte_mediana, yp, average='macro')

print('IMPUTACIÓN MEDIANA')
print(f'Accuracy: {acc_mediana:.4f}  |  F1-macro: {f1_mediana:.4f}  |  tiempo: {t_mediana:.4f}s')

IMPUTACIÓN MEDIANA
Accuracy: 0.9161  |  F1-macro: 0.9101  |  tiempo: 0.0027s


In [237]:
model = RandomForestClassifier(n_estimators=6, random_state=RANDOM_STATE, n_jobs=-1)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(Xtr_mm_imp, ytr_mediana);

#Finaliza la medición del tiempo
t_basicaR = time.perf_counter() - t0

yp = model.predict(Xte_m_imp)
acc_media = accuracy_score(yte_media, yp)
f1_media = f1_score(yte_media, yp, average='macro')

print('IMPUTACIÓN MEDIA')
print(f'Accuracy: {acc_media:.4f}  |  F1-macro: {f1_media:.4f}  |  tiempo: {t_media:.4f}s')

IMPUTACIÓN MEDIA
Accuracy: 0.9301  |  F1-macro: 0.9256  |  tiempo: 0.0030s
