# Comparación de técnicas de imputación de valores perdidos

## Importaciones y carga del dataset

In [50]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import time
import numpy as np
#from sklearn.base import accuracy_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
RANDOM_STATE = 0

def simulate_missingness(X, missing_rate=0.05, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    X2 = X.astype(float).copy()
    n, d = X2.shape
    m = int(missing_rate * n * d)
    idx = rng.choice(n*d, m, replace=False)
    row_indices = idx // d
    col_indices = idx % d
    X2.values[row_indices, col_indices] = np.nan
    return X2

# 1. Load the "Bunch" object
data = load_breast_cancer()

df_features = pd.DataFrame(data.data, columns=data.feature_names); df_target = pd.Series(data.target, name='target')
df = pd.concat([df_features, df_target], axis=1)

X = df.drop('target', axis=1)
y = df['target']
print('X shape:', X.shape, '| y shape:', y.shape)

X shape: (569, 30) | y shape: (569,)


In [5]:
def lg(Xtr, Xte, ytr, yte):
	model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
	#Comienza la medición del tiempo
	t0 = time.perf_counter()

	model.fit(Xtr, ytr);

	#Finaliza la medición del tiempo
	t = time.perf_counter() - t0

	yp = model.predict(Xte)
	acc = accuracy_score(yte, yp)
	f1 = f1_score(yte, yp, average='macro')
	return acc, f1, t

In [6]:
def rfc(Xtr, Xte, ytr, yte):
	model = RandomForestClassifier(n_estimators=6, random_state=RANDOM_STATE, n_jobs=-1)
	#Comienza la medición del tiempo
	t0 = time.perf_counter()

	model.fit(Xtr, ytr);

	#Finaliza la medición del tiempo
	t = time.perf_counter() - t0

	yp = model.predict(Xte)
	acc = accuracy_score(yte, yp)
	f1 = f1_score(yte, yp, average='macro')
	return acc, f1, t


## Base

In [7]:
Xtr_base, Xte_base, ytr_base, yte_base = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print('Train:', Xtr_base.shape, '| Test:', Xte_base.shape)

Train: (426, 30) | Test: (143, 30)


In [8]:
acc_base, f1_base, t_base = lg(Xtr_base, Xte_base, ytr_base, yte_base)

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9371  |  F1-macro: 0.9328  |  tiempo: 0.005s


In [9]:
acc_base, f1_base, t_base = rfc(Xtr_base, Xte_base, ytr_base, yte_base)
print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9580  |  F1-macro: 0.9554  |  tiempo: 0.027s


## Ahora simulamos que se pierden los valores 

In [10]:
X = simulate_missingness(X)
print("\n\\-----------------------------Visualización de Valores perdidos-----------------------------/")
has_missing = False
for columna in X.columns:
    perdidos = X[columna].isnull().sum() / len(X[columna]) * 100
    if perdidos != 0:
        print(f"La columna {columna} tiene {perdidos:.2f}% valores perdidos")
        has_missing = True

if not has_missing:
    print("No se encontraron valores perdidos. (Esto es inesperado)")


\-----------------------------Visualización de Valores perdidos-----------------------------/
La columna mean radius tiene 7.21% valores perdidos
La columna mean texture tiene 4.04% valores perdidos
La columna mean perimeter tiene 5.62% valores perdidos
La columna mean area tiene 5.27% valores perdidos
La columna mean smoothness tiene 5.62% valores perdidos
La columna mean compactness tiene 5.80% valores perdidos
La columna mean concavity tiene 4.92% valores perdidos
La columna mean concave points tiene 3.16% valores perdidos
La columna mean symmetry tiene 6.15% valores perdidos
La columna mean fractal dimension tiene 3.69% valores perdidos
La columna radius error tiene 4.57% valores perdidos
La columna texture error tiene 4.57% valores perdidos
La columna perimeter error tiene 5.62% valores perdidos
La columna area error tiene 6.33% valores perdidos
La columna smoothness error tiene 6.33% valores perdidos
La columna compactness error tiene 4.04% valores perdidos
La columna concavity 

## Imputación básica

### Eliminación de filas con NaN

In [11]:
X_elim = X.copy()
y_elim = y.copy()
print('X shape:', X_elim.shape, '| y shape:', y_elim.shape)

X shape: (569, 30) | y shape: (569,)


In [12]:
Xtr_elim, Xte_elim, ytr_elim, yte_elim = train_test_split(
    X_elim, y_elim, test_size=0.25, stratify=y_elim, random_state=RANDOM_STATE
)
print('Train:', Xtr_elim.shape, '| Test:', Xte_elim.shape)

Train: (426, 30) | Test: (143, 30)


In [13]:
Xtr_elim_imp = Xtr_elim.dropna()
ytr_elim_imp = ytr_elim.loc[Xtr_elim_imp.index]

Xte_elim_imp = Xte_elim.dropna()
yte_elim_imp = yte_elim.loc[Xte_elim_imp.index]
print('X shape:', Xtr_elim_imp.shape, '| y shape:', ytr_elim_imp.shape)
print('X shape:', Xte_elim_imp.shape, '| y shape:', yte_elim_imp.shape)

X shape: (99, 30) | y shape: (99,)
X shape: (24, 30) | y shape: (24,)


In [14]:
acc_elim, f1_elim, t_elim = lg(Xtr_elim_imp, Xte_elim_imp, ytr_elim_imp, yte_elim_imp)

print('ELIMINACIÓN')
print(f'Accuracy: {acc_elim:.4f}  |  F1-macro: {f1_elim:.4f}  |  tiempo: {t_elim:.3f}s')

ELIMINACIÓN
Accuracy: 0.9167  |  F1-macro: 0.9143  |  tiempo: 0.002s


In [15]:
acc_elim, f1_elim, t_elim = rfc(Xtr_elim_imp, Xte_elim_imp, ytr_elim_imp, yte_elim_imp)

print('ELIMINACIÓN')
print(f'Accuracy: {acc_elim:.4f}  |  F1-macro: {f1_elim:.4f}  |  tiempo: {t_elim:.3f}s')

ELIMINACIÓN
Accuracy: 0.9583  |  F1-macro: 0.9577  |  tiempo: 0.016s


### Imputación estadística básica

Media

In [16]:
X_media = X.copy()
y_media = y.copy()
print('X shape:', X_media.shape, '| y shape:', y_media.shape)

X shape: (569, 30) | y shape: (569,)


In [17]:
Xtr_media, Xte_media, ytr_media, yte_media = train_test_split(
    X_media, y_media, test_size=0.25, stratify=y_media, random_state=RANDOM_STATE
)
print('Train:', Xtr_media.shape, '| Test:', Xte_media.shape)

Train: (426, 30) | Test: (143, 30)


In [18]:
imp = SimpleImputer(strategy="mean")
Xtr_m_imp = imp.fit_transform(Xtr_media)
Xte_m_imp = imp.transform(Xte_media)

In [19]:
acc_media, f1_media, t_media = lg(Xtr_m_imp, Xte_m_imp, ytr_media, yte_media)

print('IMPUTACIÓN MEDIA')
print(f'Accuracy: {acc_media:.4f}  |  F1-macro: {f1_media:.4f}  |  tiempo: {t_media:.4f}s')

IMPUTACIÓN MEDIA
Accuracy: 0.9510  |  F1-macro: 0.9469  |  tiempo: 0.0032s


In [20]:
acc_media, f1_media, t_media = rfc(Xtr_m_imp, Xte_m_imp, ytr_media, yte_media)

print('IMPUTACIÓN MEDIA')
print(f'Accuracy: {acc_media:.4f}  |  F1-macro: {f1_media:.4f}  |  tiempo: {t_media:.4f}s')

IMPUTACIÓN MEDIA
Accuracy: 0.9161  |  F1-macro: 0.9107  |  tiempo: 0.0152s


Mediana

In [21]:
X_mediana = X.copy()
y_mediana = y.copy()
print('X shape:', X_mediana.shape, '| y shape:', y_mediana.shape)

X shape: (569, 30) | y shape: (569,)


In [22]:
Xtr_mediana, Xte_mediana, ytr_mediana, yte_mediana = train_test_split(
    X_mediana, y_mediana, test_size=0.25, stratify=y_mediana, random_state=RANDOM_STATE
)
print('Train:', Xtr_mediana.shape, '| Test:', Xte_mediana.shape)

Train: (426, 30) | Test: (143, 30)


In [23]:
imp = SimpleImputer(strategy="median")
Xtr_mm_imp = imp.fit_transform(Xtr_mediana)
Xte_mm_imp = imp.transform(Xte_mediana)

In [24]:
acc_mediana, f1_mediana, t_mediana = lg(Xtr_mm_imp, Xte_mm_imp, ytr_mediana, yte_mediana)

print('IMPUTACIÓN MEDIANA')
print(f'Accuracy: {acc_mediana:.4f}  |  F1-macro: {f1_mediana:.4f}  |  tiempo: {t_mediana:.4f}s')

IMPUTACIÓN MEDIANA
Accuracy: 0.9161  |  F1-macro: 0.9101  |  tiempo: 0.0077s


In [25]:
acc_mediana, f1_mediana, t_mediana = rfc(Xtr_mm_imp, Xte_mm_imp, ytr_mediana, yte_mediana)

print('IMPUTACIÓN MEDIANA')
print(f'Accuracy: {acc_mediana:.4f}  |  F1-macro: {f1_mediana:.4f}  |  tiempo: {t_mediana:.4f}s')

IMPUTACIÓN MEDIANA
Accuracy: 0.9301  |  F1-macro: 0.9256  |  tiempo: 0.0171s


Moda

In [26]:
X_moda = X.copy()
y_moda = y.copy()
print('X shape:', X_moda.shape, '| y shape:', y_moda.shape)

X shape: (569, 30) | y shape: (569,)


In [27]:
Xtr_moda, Xte_moda, ytr_moda, yte_moda = train_test_split(
    X_moda, y_moda, test_size=0.25, stratify=y_moda, random_state=RANDOM_STATE
)
print('Train:', Xtr_moda.shape, '| Test:', Xte_moda.shape)

Train: (426, 30) | Test: (143, 30)


In [28]:
imp = SimpleImputer(strategy="most_frequent")
Xtr_mo_imp = imp.fit_transform(Xtr_moda)
Xte_mo_imp = imp.transform(Xte_moda)

In [29]:
acc_moda, f1_moda, t_moda = lg(Xtr_mo_imp, Xte_mo_imp, ytr_moda, yte_moda)

print('IMPUTACIÓN MODA')
print(f'Accuracy: {acc_moda:.4f}  |  F1-macro: {f1_moda:.4f}  |  tiempo: {t_moda:.4f}s')

IMPUTACIÓN MODA
Accuracy: 0.9231  |  F1-macro: 0.9172  |  tiempo: 0.0031s


In [30]:
acc_moda, f1_moda, t_moda = rfc(Xtr_mo_imp, Xte_mo_imp, ytr_moda, yte_moda)

print('IMPUTACIÓN MODA')
print(f'Accuracy: {acc_moda:.4f}  |  F1-macro: {f1_moda:.4f}  |  tiempo: {t_moda:.4f}s')

IMPUTACIÓN MODA
Accuracy: 0.9441  |  F1-macro: 0.9400  |  tiempo: 0.0148s


## Imputación avanzada

### Definición de funciones

In [31]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class KMeansMissingImputer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=5, max_iter=10, tol=1e-4, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centroids_ = None

    def fit(self, X, y=None):
        # Convert to numpy array if it's a DataFrame
        X = np.array(X)
        n_samples, n_features = X.shape
        
        rng = np.random.RandomState(self.random_state)
        
        # 1. Initialization: Pick k random points as initial centroids
        # If the chosen point has NaNs, fill them with a random value 
        # from that column to ensure centroids start clean.
        random_idx = rng.permutation(n_samples)[:self.n_clusters]
        self.centroids_ = X[random_idx].copy()
        
        # Handle NaNs in initial centroids
        for j in range(n_features):
            mask = np.isnan(self.centroids_[:, j])
            if mask.any():
                # Fill with random valid value from column j
                valid_values = X[~np.isnan(X[:, j]), j]
                if len(valid_values) > 0:
                    self.centroids_[mask, j] = rng.choice(valid_values, size=mask.sum())
                else:
                    self.centroids_[mask, j] = 0 # Fallback if col is empty

        # 2. The Main Loop
        for iteration in range(self.max_iter):
            old_centroids = self.centroids_.copy()
            
            # --- E-Step: Assign clusters (Partial Distance) ---
            labels = self._assign_clusters(X)
            
            # --- M-Step: Update centroids ---
            for k in range(self.n_clusters):
                # Get all points assigned to cluster k
                cluster_points = X[labels == k]
                
                if len(cluster_points) > 0:
                    # Calculate mean IGNORING NaNs (nanmean)
                    # This naturally handles the missing data update
                    new_center = np.nanmean(cluster_points, axis=0)
                    
                    # If a dimension is all NaNs for this cluster, keep old value
                    nan_mask = np.isnan(new_center)
                    self.centroids_[k, ~nan_mask] = new_center[~nan_mask]
            
            # Check for convergence
            shift = np.sqrt(np.nansum((self.centroids_ - old_centroids) ** 2))
            if shift < self.tol:
                break
                
        return self

    def transform(self, X):
        X = np.array(X)
        X_imputed = X.copy()
        
        # 1. Find closest cluster for each point (using Partial Distance)
        labels = self._assign_clusters(X)
        
        # 2. Fill NaNs with the centroid value of the assigned cluster
        for k in range(self.n_clusters):
            # Mask for points in this cluster
            cluster_mask = (labels == k)
            
            # Mask for missing values in these points
            nan_mask = np.isnan(X_imputed[cluster_mask])
            
            # We need to broadcast the centroid to the shape of the cluster subset
            centroid_vals = self.centroids_[k]
            
            # Fill ONLY the missing values
            # We iterate columns to handle the shape mismatch easily
            subset = X_imputed[cluster_mask]
            for col in range(X.shape[1]):
                col_nan = np.isnan(subset[:, col])
                subset[col_nan, col] = centroid_vals[col]
            
            X_imputed[cluster_mask] = subset
            
        return X_imputed

    def _assign_clusters(self, X):
        n_samples, n_features = X.shape
        distances = np.zeros((n_samples, self.n_clusters))
        
        for k in range(self.n_clusters):
            centroid = self.centroids_[k]
            
            # Difference (X - C)
            diff = X - centroid # (n_samples, n_features)
            sq_diff = diff ** 2
            
            # Mask where data is missing (in either X or Centroid)
            # We treat NaN difference as 0 for the sum
            nan_mask = np.isnan(sq_diff)
            sq_diff[nan_mask] = 0
            
            # Count valid dimensions for each point
            # (How many columns were NOT nan)
            valid_counts = n_features - nan_mask.sum(axis=1)
            
            # Avoid division by zero
            valid_counts[valid_counts == 0] = 1 
            
            # Partial Euclidean Distance
            # Sum of squared errors * (Total Dims / Valid Dims)
            # This scales the distance so points with missing data aren't "closer"
            d_squared = np.sum(sq_diff, axis=1) * (n_features / valid_counts)
            distances[:, k] = d_squared
            
        # Return index of closest centroid
        return np.argmin(distances, axis=1)

### KNNImputer

In [32]:
X_knn = X.copy()
y_knn = y.copy()
print('X shape:', X_knn.shape, '| y shape:', y_knn.shape)

X shape: (569, 30) | y shape: (569,)


In [33]:
Xtr_knn, Xte_knn, ytr_knn, yte_knn = train_test_split(
    X_knn, y_knn, test_size=0.25, stratify=y_knn, random_state=RANDOM_STATE
)
print('Train:', Xtr_knn.shape, '| Test:', Xte_knn.shape)

Train: (426, 30) | Test: (143, 30)


In [34]:
for k in range(5, 21):
	imp = KNNImputer(n_neighbors=k)
	Xtr_knn_imp = imp.fit_transform(Xtr_knn)
	Xte_knn_imp = imp.transform(Xte_knn)

	acc_knn, f1_knn, t_knn = lg(Xtr_knn_imp, Xte_knn_imp, ytr_knn, yte_knn)
	print('IMPUTACIÓN KNN-5 LG(', k, ')')
	print(f'Accuracy: {acc_knn:.6f}  |  F1-macro: {f1_knn:.6f}  |  tiempo: {t_knn:.6f}s\n')


	acc_knn, f1_knn, t_knn = rfc(Xtr_knn_imp, Xte_knn_imp, ytr_knn, yte_knn)
	print('IMPUTACIÓN KNN-5 RF(', k, ')')
	print(f'Accuracy: {acc_knn:.6f}  |  F1-macro: {f1_knn:.6f}  |  tiempo: {t_knn:.6f}s\n\n')

IMPUTACIÓN KNN-5 LG( 5 )
Accuracy: 0.944056  |  F1-macro: 0.940042  |  tiempo: 0.001907s

IMPUTACIÓN KNN-5 RF( 5 )
Accuracy: 0.951049  |  F1-macro: 0.947737  |  tiempo: 0.014511s


IMPUTACIÓN KNN-5 LG( 6 )
Accuracy: 0.937063  |  F1-macro: 0.932281  |  tiempo: 0.001724s

IMPUTACIÓN KNN-5 RF( 6 )
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.014555s


IMPUTACIÓN KNN-5 LG( 7 )
Accuracy: 0.944056  |  F1-macro: 0.940042  |  tiempo: 0.001844s

IMPUTACIÓN KNN-5 RF( 7 )
Accuracy: 0.958042  |  F1-macro: 0.955031  |  tiempo: 0.014353s


IMPUTACIÓN KNN-5 LG( 8 )
Accuracy: 0.951049  |  F1-macro: 0.947737  |  tiempo: 0.001878s

IMPUTACIÓN KNN-5 RF( 8 )
Accuracy: 0.951049  |  F1-macro: 0.947330  |  tiempo: 0.011968s


IMPUTACIÓN KNN-5 LG( 9 )
Accuracy: 0.951049  |  F1-macro: 0.947737  |  tiempo: 0.001655s

IMPUTACIÓN KNN-5 RF( 9 )
Accuracy: 0.951049  |  F1-macro: 0.947737  |  tiempo: 0.014130s


IMPUTACIÓN KNN-5 LG( 10 )
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.001761s

IMPU

In [35]:
for k in range(5, 21):
	imp = KNNImputer(n_neighbors=k, weights='distance')
	Xtr_knnw_imp = imp.fit_transform(Xtr_knn)
	Xte_knnw_imp = imp.transform(Xte_knn)

	acc_knnw, f1_knnw, t_knnw = lg(Xtr_knnw_imp, Xte_knnw_imp, ytr_knn, yte_knn)
	print('IMPUTACIÓN KNNW-', k, 'LG')
	print(f'Accuracy: {acc_knnw:.6f}  |  F1-macro: {f1_knnw:.6f}  |  tiempo: {t_knnw:.6f}s\n')


	acc_knnw, f1_knnw, t_knnw = rfc(Xtr_knnw_imp, Xte_knnw_imp, ytr_knn, yte_knn)
	print('IMPUTACIÓN KNNW-', k, 'RF')
	print(f'Accuracy: {acc_knnw:.6f}  |  F1-macro: {f1_knnw:.6f}  |  tiempo: {t_knnw:.6f}s\n\n')

IMPUTACIÓN KNNW- 5 LG
Accuracy: 0.930070  |  F1-macro: 0.925052  |  tiempo: 0.002521s

IMPUTACIÓN KNNW- 5 RF
Accuracy: 0.923077  |  F1-macro: 0.918468  |  tiempo: 0.013952s


IMPUTACIÓN KNNW- 6 LG
Accuracy: 0.930070  |  F1-macro: 0.925052  |  tiempo: 0.002036s

IMPUTACIÓN KNNW- 6 RF
Accuracy: 0.930070  |  F1-macro: 0.926136  |  tiempo: 0.013379s


IMPUTACIÓN KNNW- 7 LG
Accuracy: 0.930070  |  F1-macro: 0.925052  |  tiempo: 0.002068s

IMPUTACIÓN KNNW- 7 RF
Accuracy: 0.923077  |  F1-macro: 0.918468  |  tiempo: 0.014202s


IMPUTACIÓN KNNW- 8 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.002084s

IMPUTACIÓN KNNW- 8 RF
Accuracy: 0.923077  |  F1-macro: 0.918468  |  tiempo: 0.014306s


IMPUTACIÓN KNNW- 9 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.001753s

IMPUTACIÓN KNNW- 9 RF
Accuracy: 0.930070  |  F1-macro: 0.925614  |  tiempo: 0.013326s


IMPUTACIÓN KNNW- 10 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.001681s

IMPUTACIÓN KNNW- 10 RF
Accuracy: 0.94

### K-Means

In [36]:
X_means = X.copy()
y_means = y.copy()
print('X shape:', X_means.shape, '| y shape:', y_means.shape)

X shape: (569, 30) | y shape: (569,)


In [37]:
Xtr_means, Xte_means, ytr_means, yte_means = train_test_split(
    X_means, y_means, test_size=0.25, stratify=y_means, random_state=RANDOM_STATE
)
print('Train:', Xtr_means.shape, '| Test:', Xte_means.shape)

Train: (426, 30) | Test: (143, 30)


In [38]:
for k in range(5, 21):
	imp = KMeansMissingImputer(n_clusters=k)
	Xtr_means_imp = imp.fit_transform(Xtr_means)
	Xte_means_imp = imp.transform(Xte_means)

	acc_means, f1_means, t_means = lg(Xtr_means_imp, Xte_means_imp, ytr_means, yte_means)
	print('IMPUTACIÓN KM-', k, 'LG')
	print(f'Accuracy: {acc_means:.6f}  |  F1-macro: {f1_means:.6f}  |  tiempo: {t_means:.6f}s\n')


	acc_means, f1_means, t_means = rfc(Xtr_means_imp, Xte_means_imp, ytr_means, yte_means)
	print('IMPUTACIÓN KM-', k, 'RF')
	print(f'Accuracy: {acc_means:.6f}  |  F1-macro: {f1_means:.6f}  |  tiempo: {t_means:.6f}s\n\n')

IMPUTACIÓN KM- 5 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.004524s

IMPUTACIÓN KM- 5 RF
Accuracy: 0.930070  |  F1-macro: 0.925052  |  tiempo: 0.017874s


IMPUTACIÓN KM- 6 LG
Accuracy: 0.930070  |  F1-macro: 0.925614  |  tiempo: 0.001941s

IMPUTACIÓN KM- 6 RF
Accuracy: 0.930070  |  F1-macro: 0.925614  |  tiempo: 0.014282s


IMPUTACIÓN KM- 7 LG
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.002033s

IMPUTACIÓN KM- 7 RF
Accuracy: 0.930070  |  F1-macro: 0.926136  |  tiempo: 0.014229s


IMPUTACIÓN KM- 8 LG
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.002062s

IMPUTACIÓN KM- 8 RF
Accuracy: 0.937063  |  F1-macro: 0.933292  |  tiempo: 0.014265s


IMPUTACIÓN KM- 9 LG
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.001899s

IMPUTACIÓN KM- 9 RF
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.014378s


IMPUTACIÓN KM- 10 LG
Accuracy: 0.951049  |  F1-macro: 0.947737  |  tiempo: 0.001972s

IMPUTACIÓN KM- 10 RF
Accuracy: 0.937063  |  F1-macro: 0.932

  new_center = np.nanmean(cluster_points, axis=0)
  new_center = np.nanmean(cluster_points, axis=0)


IMPUTACIÓN KM- 13 RF
Accuracy: 0.944056  |  F1-macro: 0.940491  |  tiempo: 0.014649s


IMPUTACIÓN KM- 14 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.001890s

IMPUTACIÓN KM- 14 RF
Accuracy: 0.930070  |  F1-macro: 0.925614  |  tiempo: 0.014588s


IMPUTACIÓN KM- 15 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.001846s

IMPUTACIÓN KM- 15 RF
Accuracy: 0.923077  |  F1-macro: 0.919022  |  tiempo: 0.014380s


IMPUTACIÓN KM- 16 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.001703s

IMPUTACIÓN KM- 16 RF
Accuracy: 0.923077  |  F1-macro: 0.919022  |  tiempo: 0.014322s


IMPUTACIÓN KM- 17 LG
Accuracy: 0.937063  |  F1-macro: 0.932804  |  tiempo: 0.001900s

IMPUTACIÓN KM- 17 RF
Accuracy: 0.930070  |  F1-macro: 0.926136  |  tiempo: 0.013904s


IMPUTACIÓN KM- 18 LG
Accuracy: 0.944056  |  F1-macro: 0.940042  |  tiempo: 0.002285s

IMPUTACIÓN KM- 18 RF
Accuracy: 0.937063  |  F1-macro: 0.933292  |  tiempo: 0.014324s


IMPUTACIÓN KM- 19 LG
Accuracy: 0.944056  |  F1-m

  new_center = np.nanmean(cluster_points, axis=0)
  new_center = np.nanmean(cluster_points, axis=0)


### SVM

In [39]:
X_svm = X.copy()
y_svm = y.copy()

In [40]:
Xtr_svm, Xte_svm, ytr_svm, yte_svm = train_test_split(
    X_svm, y_svm, test_size=0.25, stratify=y_svm, random_state=RANDOM_STATE
)
print('Train:', Xtr_svm.shape, '| Test:', Xte_svm.shape)

Train: (426, 30) | Test: (143, 30)


In [54]:
svm_pipeline = make_pipeline(
    StandardScaler(),
    LinearSVR(random_state=RANDOM_STATE, dual="auto", tol=0.2, max_iter=50000, C=0.1)
)

# 2. Pass this pipeline into the Imputer
imp = IterativeImputer(
    estimator=svm_pipeline, 
    max_iter=10, 
    random_state=RANDOM_STATE
)

# The output will be in the ORIGINAL scale (because the pipeline is internal)
Xtr_s_imp = imp.fit_transform(Xtr_svm)
Xte_s_imp = imp.transform(Xte_svm)

In [55]:
acc_svm, f1_svm, t_svm = lg(Xtr_s_imp, Xte_s_imp, ytr_svm, yte_svm)
print('IMPUTACIÓN SVM LG')
print(f'Accuracy: {acc_svm:.6f}  |  F1-macro: {f1_svm:.6f}  |  tiempo: {t_svm:.6f}s\n')


IMPUTACIÓN SVM LG
Accuracy: 0.902098  |  F1-macro: 0.896591  |  tiempo: 0.007309s



In [56]:
acc_svm, f1_svm, t_svm = rfc(Xtr_s_imp, Xte_s_imp, ytr_svm, yte_svm)
print('IMPUTACIÓN SVM RF')
print(f'Accuracy: {acc_svm:.6f}  |  F1-macro: {f1_svm:.6f}  |  tiempo: {t_svm:.6f}s\n\n')

IMPUTACIÓN SVM RF
Accuracy: 0.923077  |  F1-macro: 0.919022  |  tiempo: 0.017354s


