# Comparación de técnicas de imputación de valores perdidos

## Importaciones y carga del dataset

In [59]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time
import numpy as np
#from sklearn.base import accuracy_score
from sklearn.metrics import f1_score

RANDOM_STATE = 0

def simulate_missingness(X, missing_rate=0.05, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    X2 = X.astype(float).copy()
    n, d = X2.shape
    m = int(missing_rate * n * d)
    idx = rng.choice(n*d, m, replace=False)
    X2[idx // d, idx % d] = np.nan
    return X2

# 1. Load the "Bunch" object
data = load_breast_cancer()

df_features = pd.DataFrame(data.data, columns=data.feature_names); df_target = pd.Series(data.target, name='target')
df = pd.concat([df_features, df_target], axis=1)

X = df.drop('target', axis=1)
y = df['target']

simulate_missingness(X)
print('X shape:', X.shape, '| y shape:', y.shape)
print("\n\\-----------------------------Visualización de Valores perdidos-----------------------------/")
for columna in X.columns:
	perdidos = X[columna].isnull().sum() / len(X[columna]) * 100
	if perdidos != 0:
		print(f"La columna {columna} tiene {perdidos:.2f}% valores perdidos")
print("\nEl resto de columnas no tienen valores perdidos")

TypeError: unhashable type: 'numpy.ndarray'

## Base

In [47]:
df_base = df.copy()
X = df_base.drop(columns='survived')
y = df_base['survived']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

In [50]:
model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
#Comienza la medición del tiempo
t0 = time.perf_counter()

model.fit(X_train, y_train);

#Finaliza la medición del tiempo
t_base = time.perf_counter() - t0

yp = model.predict(X_test)
acc_base = accuracy_score(y_test, yp)
f1_base = f1_score(y_test, yp, average='macro')

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

ValueError: could not convert string to float: 'Franklin, Mr. Charles (Charles Fardon)'

## Imputación básica

In [None]:
df_básica = df.copy()