# Práctica 2

Objetivo: comprender cada técnica de *preparación de datos* y *selección de características*.



**Contenido**
1. Imports y utilidades
2. Línea base (escalado + regresión logística)
3. Imputación (comparación con eliminar filas perdidas)
4. Selección tipo filtro (f\_classif y chi2)
5. RFECV (eliminación recursiva con validación cruzada)
6. SelectFromModel (L1 y Random Forest)
7. Selección de instancias
8. (Opcional) Demostración con `Pipeline`
9. (Opcional) Mini ejemplo de **regresión**

## 1) Imports y utilidades

In [2]:
# 1) Configuración y carga del dataset (clasificación)
import warnings, time
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Diferentes datasets de clasificación y regresión que se pueden usar
from sklearn.datasets import load_breast_cancer, fetch_california_housing, load_diabetes

# Algunas utilidades
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, chi2, RFECV, SelectFromModel

RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)

def standardize_train_test(Xtr, Xte):
    sc = StandardScaler()
    return sc.fit_transform(Xtr), sc.transform(Xte)

def simulate_missingness(X, missing_rate=0.05, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    X2 = X.astype(float).copy()
    n, d = X2.shape
    m = int(missing_rate * n * d)
    idx = rng.choice(n*d, m, replace=False)
    X2[idx // d, idx % d] = np.nan
    return X2

# Cargamos un problema de clasificación: Breast Cancer (binaria)
data = load_breast_cancer()
X, y = data.data, data.target
print('X shape:', X.shape, '| y shape:', y.shape)

# Partición train/test estratificada
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print('Train:', X_train.shape, '| Test:', X_test.shape)

X shape: (569, 30) | y shape: (569,)
Train: (426, 30) | Test: (143, 30)


## 2) Línea base (escalado + clasificador sencillo)
Entrenamos sin selección ni imputación en un dataset sin valores perdidos.

In [None]:
# Escalado
Xtr_s, Xte_s = standardize_train_test(X_train, X_test)

# Clasificador ligero (rápido en aula)
clf_base = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf_base.fit(Xtr_s, y_train); t_base = time.perf_counter() - t0
yp = clf_base.predict(Xte_s)
acc_base = accuracy_score(y_test, yp)
f1_base = f1_score(y_test, yp, average='macro')

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9580  |  F1-macro: 0.9550  |  tiempo: 0.018s


## 3) Imputación (comparación con eliminar filas perdidas)
Simulamos un **5%** de valores perdidos y comparamos:
- **Eliminar filas con NaN** (train y test por separado)
- **Imputación simple** (media)
- **Imputación KNN** (k=5)

In [3]:
# Simulamos valores perdidos
Xtr_m = simulate_missingness(X_train, 0.05)
Xte_m = simulate_missingness(X_test, 0.05)
res_imput = []

In [4]:
# A) Eliminar filas con NaN (cuidado: reducimos datos)
mask_tr = ~np.isnan(Xtr_m).any(axis=1)
mask_te = ~np.isnan(Xte_m).any(axis=1)
Xtr_drop, ytr_drop = Xtr_m[mask_tr], y_train[mask_tr]
Xte_drop, yte_drop = Xte_m[mask_te], y_test[mask_te]
Xtr_s, Xte_s = standardize_train_test(Xtr_drop, Xte_drop)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, ytr_drop)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Eliminar filas",
        Xtr_drop.shape[0],
        Xte_drop.shape[0],
        accuracy_score(yte_drop, yp),
        f1_score(yte_drop, yp, average="macro"),
        t,
    ]
)

In [5]:
# B) SimpleImputer (media)
imp = SimpleImputer(strategy="mean")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: media",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [6]:
# C) KNNImputer (k=5)
imp = KNNImputer(n_neighbors=5)
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: KNN (k=5)",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)


In [7]:
# OBLIGATORIO: Añade otro método de imputación básico
# Utilizamos imputación con la mediana
imp = SimpleImputer(strategy="median")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: mediana",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [8]:
# OBLIGATORIO: Añade otro método de imputación avanzado
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.svm import SVR

imp = IterativeImputer(
    estimator=SVR(kernel="rbf", C=10, epsilon=0.1, gamma="scale"),
    initial_strategy="median",
    max_iter=10,
    random_state=0,
)

Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: SVR",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [9]:
# Mostramos resultados
df_imput = pd.DataFrame(res_imput, columns=['Tratamiento', 'n_train', 'n_test', 'Accuracy', 'F1-macro', 'tiempo_entreno_s'])
df_imput

Unnamed: 0,Tratamiento,n_train,n_test,Accuracy,F1-macro,tiempo_entreno_s
0,Eliminar filas,90,33,0.939394,0.93797,0.00325
1,Imputación: media,426,143,0.965035,0.962378,0.003671
2,Imputación: KNN (k=5),426,143,0.958042,0.955031,0.003845
3,Imputación: mediana,426,143,0.958042,0.955031,0.002941
4,Imputación: SVR,426,143,0.937063,0.932281,0.003711


In [5]:
# OPCIONAL: Usa un dataset de regresión, repite todos los métodos de imputación y muestra los resultados.
# No olvides adaptar todas las métricas: Accuracy/F1 solo sirven para problemas de clasificación
from sklearn.linear_model import LinearRegression

data2 = fetch_california_housing()
X2, y2 = data2.data, data2.target
print('X shape:', X2.shape, '| y shape:', y2.shape)

# Partición train/test estratificada
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.25, random_state=RANDOM_STATE
)

X2tr_m = simulate_missingness(X2_train, 0.05)
X2te_m = simulate_missingness(X2_test, 0.05)
res_imput2 = []

# B) SimpleImputer (media)
imp2 = SimpleImputer(strategy="mean")
X2tr_imp = imp2.fit_transform(X2tr_m)
X2te_imp = imp2.transform(X2te_m)
X2tr_s, X2te_s = standardize_train_test(X2tr_imp, X2te_imp)
clf2 = LinearRegression()
t02 = time.perf_counter()
clf2.fit(X2tr_s, y2_train)
t2 = time.perf_counter() - t02
yp2 = clf2.predict(X2te_s)
res_imput2.append(
    [
        "Imputación: media",
        X2tr_imp.shape[0],
        X2te_imp.shape[0],
        mean_squared_error(y2_test, yp2),
        r2_score(y2_test, yp2),
        t2,
    ]
)

# C) KNNImputer (k=5)
imp2 = KNNImputer(n_neighbors=5)
X2tr_imp = imp2.fit_transform(X2tr_m)
X2te_imp = imp2.transform(X2te_m)
X2tr_s, X2te_s = standardize_train_test(X2tr_imp, X2te_imp)
clf2 = LinearRegression()
t02 = time.perf_counter()
clf2.fit(X2tr_s, y2_train)
t2 = time.perf_counter() - t02
yp2 = clf2.predict(X2te_s)
res_imput2.append(
    [
        "Imputación: KNN (k=5)",
        X2tr_imp.shape[0],
        X2te_imp.shape[0],
        mean_squared_error(y2_test, yp2),
        r2_score(y2_test, yp2),
        t2,
    ]
)

# Utilizamos imputación con la mediana
imp2 = SimpleImputer(strategy="median")
X2tr_imp = imp2.fit_transform(X2tr_m)
X2te_imp = imp2.transform(X2te_m)
X2tr_s, X2te_s = standardize_train_test(X2tr_imp, X2te_imp)
clf2 = LinearRegression()
t02 = time.perf_counter()
clf2.fit(X2tr_s, y2_train)
t2 = time.perf_counter() - t02
y2p = clf2.predict(X2te_s)
res_imput2.append(
    [
        "Imputación: mediana",
        X2tr_imp.shape[0],
        X2te_imp.shape[0],
        mean_squared_error(y2_test, yp2),
        r2_score(y2_test, yp2),
        t2,
    ]
)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.svm import SVR

imp2 = IterativeImputer(
    estimator=SVR(kernel="rbf", C=10, epsilon=0.1, gamma="scale"),
    initial_strategy="median",
    max_iter=10,
    random_state=0,
)

X2tr_imp = imp.fit_transform(X2tr_m)
X2te_imp = imp.transform(X2te_m)
X2tr_s, X2te_s = standardize_train_test(X2tr_imp, X2te_imp)
clf2 = LinearRegression()
t02 = time.perf_counter()
clf2.fit(X2tr_s, y2_train)
t2 = time.perf_counter() - t02
yp2 = clf2.predict(X2te_s)
res_imput2.append(
    [
        "Imputación: SVR",
        X2tr_imp.shape[0],
        X2te_imp.shape[0],
        mean_squared_error(y2_test, yp2),
        r2_score(y2_test, yp2),
        t2,
    ]
)

X shape: (20640, 8) | y shape: (20640,)


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos y decidiendo cuál es el mejor método de imputación en cada caso

In [60]:
# Mostramos resultados
df_imput2 = pd.DataFrame(res_imput2, columns=['Tratamiento', 'n_train', 'n_test', 'MSE', 'R2', 'tiempo_entreno_s'])
df_imput2

Unnamed: 0,Tratamiento,n_train,n_test,MSE,R2,tiempo_entreno_s
0,Imputación: media,15480,5160,0.632049,0.521845,0.003214
1,Imputación: KNN (k=5),15480,5160,0.575216,0.56484,0.002768
2,Imputación: mediana,15480,5160,0.575216,0.56484,0.003675
3,Imputación: SVR,15480,5160,0.635165,0.519488,0.002326


## 4) Selección tipo filtro
Comparamos *sin selección* vs **SelectKBest** con:
- `f_classif` (general)
- `chi2` (requiere no-negatividad, debemos aplicar `MinMaxScaler` antes de usarlo)

In [10]:
# Usamos los datos SIN NaN (X_train / X_test originales)
imp = SimpleImputer()  # por seguridad
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s0, Xte_s0 = standardize_train_test(Xtr, Xte)

# Baseline
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s0, y_train)
t_base2 = time.perf_counter() - t0
yp = clf.predict(Xte_s0)
acc0 = accuracy_score(y_test, yp)
f10 = f1_score(y_test, yp, average="macro")

rows = [["Sin selección", Xtr.shape[1], acc0, f10, t_base2]]


In [11]:
# SelectKBest f_classif (k=10)
k = min(10, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)


In [12]:
# SelectKBest chi2 (k=10) → MinMax
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])



In [13]:
# OBLIGATORIO: Prueba con diferentes valores de n_features
k = min(5, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])

k = min(20, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])

df_imput = pd.DataFrame(rows, columns=['Tratamiento', 'k', 'Accuracy', 'F1_score', 'tiempo_entreno_s'])
df_imput

Unnamed: 0,Tratamiento,k,Accuracy,F1_score,tiempo_entreno_s
0,Sin selección,30,0.958042,0.955031,0.003046
1,SelectKBest f_classif (k=10),10,0.951049,0.94733,0.002413
2,SelectKBest chi2 (k=10),10,0.93007,0.924451,0.001664
3,SelectKBest f_classif (k=5),5,0.93007,0.925052,0.001616
4,SelectKBest chi2 (k=5),5,0.93007,0.924451,0.001723
5,SelectKBest f_classif (k=20),20,0.965035,0.962067,0.002074
6,SelectKBest chi2 (k=20),20,0.944056,0.939045,0.001786


In [14]:
# OBLIGATORIO: Usa un segundo problema de clasificación y repite todo
# No escoger digits -> Ejemplo para usar "Wines" (Hay que buscar)
from sklearn.datasets import load_wine
data3 = load_wine()
X3, y3 = data3.data, data3.target

# Partición train/test estratificada
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.25, random_state=RANDOM_STATE
)

X3tr_m = simulate_missingness(X3_train, 0.05)
X3te_m = simulate_missingness(X3_test, 0.05)
res_imput3 = []

imp3 = SimpleImputer()  # por seguridad
X3tr = imp3.fit_transform(X3_train)
X3te = imp3.transform(X3_test)
X3tr_s0, X3te_s0 = standardize_train_test(X3tr, X3te)

# Baseline
clf3 = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t03 = time.perf_counter()
clf3.fit(X3tr_s0, y3_train)
t3_base2 = time.perf_counter() - t03
yp3 = clf3.predict(X3te_s0)
acc03 = accuracy_score(y3_test, yp3)
f103 = f1_score(y3_test, yp3, average="macro")

rows3 = [["Sin selección", X3tr.shape[1], acc03, f103, t3_base2]]


k = min(10, Xtr.shape[1])
sel3 = SelectKBest(score_func=f_classif, k=k)
X3tr_k = sel3.fit_transform(X3tr_s0, y3_train)
X3te_k = sel3.transform(X3te_s0)
clf3 = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t03 = time.perf_counter()
clf3.fit(X3tr_k, y3_train)
t13 = time.perf_counter() - t03
y3p = clf3.predict(X3te_k)
rows3.append(
    [
        f"SelectKBest f_classif Wines (k={k})",
        k,
        accuracy_score(y3_test, yp3),
        f1_score(y3_test, yp3, average="macro"),
        t1,
    ]
)

mm3 = MinMaxScaler()
X3tr_mm = mm3.fit_transform(X3tr)
X3te_mm = mm3.transform(X3te)
sel3 = SelectKBest(score_func=chi2, k=k)
X3tr_k2 = sel3.fit_transform(X3tr_mm, y3_train)
X3te_k2 = sel3.transform(X3te_mm)
clf3 = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t03 = time.perf_counter(); clf3.fit(X3tr_k2, y3_train); t23 = time.perf_counter() - t03
yp3 = clf3.predict(X3te_k2)
rows3.append([f'SelectKBest chi2 Wines (k={k})', k, accuracy_score(y3_test, yp3), f1_score(y3_test, yp3, average='macro'), t2])

k = min(5, Xtr.shape[1])
sel3 = SelectKBest(score_func=f_classif, k=k)
X3tr_k = sel3.fit_transform(X3tr_s0, y3_train)
X3te_k = sel3.transform(X3te_s0)
clf3 = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t03 = time.perf_counter()
clf3.fit(X3tr_k, y3_train)
t13 = time.perf_counter() - t03
yp3 = clf3.predict(X3te_k)
rows3.append(
    [
        f"SelectKBest f_classif Wines (k={k})",
        k,
        accuracy_score(y3_test, yp3),
        f1_score(y3_test, yp3, average="macro"),
        t13,
    ]
)

X3tr_mm = mm3.fit_transform(X3tr)
X3te_mm = mm3.transform(X3te)
sel3 = SelectKBest(score_func=chi2, k=k)
X3tr_k2 = sel3.fit_transform(X3tr_mm, y3_train)
X3te_k2 = sel3.transform(X3te_mm)
clf3 = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t03 = time.perf_counter() 
clf3.fit(X3tr_k2, y3_train) 
t23 = time.perf_counter() - t03
yp3 = clf3.predict(X3te_k2)
rows3.append([f'SelectKBest chi2 Wines (k={k})', k, accuracy_score(y3_test, yp3), f1_score(y3_test, yp3, average='macro'), t23])

k = min(20, X3tr.shape[1])
sel3 = SelectKBest(score_func=f_classif, k=k)
X3tr_k = sel3.fit_transform(X3tr_s0, y3_train)
X3te_k = sel3.transform(X3te_s0)
clf3 = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t03 = time.perf_counter()
clf3.fit(X3tr_k, y3_train)
t13 = time.perf_counter() - t03
yp3 = clf3.predict(X3te_k)
rows3.append(
    [
        f"SelectKBest f_classif Wines(k={k})",
        k,
        accuracy_score(y3_test, yp3),
        f1_score(y3_test, yp3, average="macro"),
        t13,
    ]
)

X3tr_mm = mm.fit_transform(X3tr)
X3te_mm = mm.transform(X3te)
sel3 = SelectKBest(score_func=chi2, k=k)
X3tr_k2 = sel.fit_transform(X3tr_mm, y3_train)
X3te_k2 = sel.transform(X3te_mm)
clf3 = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t03 = time.perf_counter(); clf3.fit(X3tr_k2, y3_train); t23 = time.perf_counter() - t03
yp3 = clf3.predict(X3te_k2)
rows3.append([f'SelectKBest chi2 Wines (k={k})', k, accuracy_score(y3_test, yp3), f1_score(y3_test, yp3, average='macro'), t23])


In [46]:
# OPCIONAL: Usa un problema de regresión, adapta lo necesario y repite todo
from sklearn.feature_selection import r_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression

data4 = load_diabetes()
X4, y4 = data4.data, data4.target

# Partición train/test estratificada
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, test_size=0.25, random_state=RANDOM_STATE
)

X4tr_m = simulate_missingness(X4_train, 0.05)
X4te_m = simulate_missingness(X4_test, 0.05)
res_imput4 = []

imp4 = SimpleImputer()  # por seguridad
X4tr = imp4.fit_transform(X4_train)
X4te = imp4.transform(X4_test)
X4tr_s0, X4te_s0 = standardize_train_test(X4tr, X4te)

# Baseline
clf4 = LinearRegression()
t04 = time.perf_counter()
clf4.fit(X4tr_s0, y4_train)
t4_base2 = time.perf_counter() - t04
yp4 = clf4.predict(X4te_s0)
acc04 = mean_squared_error(y4_test, yp4)
f104 = r2_score(y4_test, yp4)

rows4 = [["Sin selección", X4tr.shape[1], acc04, f104, t4_base2]]


k = min(10, Xtr.shape[1])
sel4 = SelectKBest(score_func=r_regression, k=k)
X4tr_k = sel4.fit_transform(X4tr_s0, y4_train)
X4te_k = sel4.transform(X4te_s0)
clf4 = LinearRegression()
t04 = time.perf_counter()
clf4.fit(X4tr_k, y4_train)
t14 = time.perf_counter() - t04
y4p = clf4.predict(X4te_k)
rows4.append(
    [
        f"SelectKBest r_regression Diabetes (k={k})",
        k,
        mean_squared_error(y4_test, yp4),
        r2_score(y4_test, yp4),
        t1,
    ]
)

mm4 = MinMaxScaler()
X4tr_mm = mm4.fit_transform(X4tr)
X4te_mm = mm4.transform(X4te)
sel4 = SelectKBest(score_func=mutual_info_regression, k=k)
X4tr_k2 = sel4.fit_transform(X4tr_mm, y4_train)
X4te_k2 = sel4.transform(X4te_mm)
clf4 = LinearRegression()
t04 = time.perf_counter(); clf4.fit(X4tr_k2, y4_train); t24 = time.perf_counter() - t04
yp4 = clf4.predict(X4te_k2)
rows4.append([f'SelectKBest mutual_info_regression Diabetes (k={k})', k, mean_squared_error(y4_test, yp4), r2_score(y4_test, yp4), t2])

k = min(5, Xtr.shape[1])
sel4 = SelectKBest(score_func=r_regression, k=k)
X4tr_k = sel4.fit_transform(X4tr_s0, y4_train)
X4te_k = sel4.transform(X4te_s0)
clf4 = LinearRegression()
t04 = time.perf_counter()
clf4.fit(X4tr_k, y4_train)
t14 = time.perf_counter() - t04
yp4 = clf4.predict(X4te_k)
rows4.append(
    [
        f"SelectKBest r_regression Diabetes (k={k})",
        k,
        mean_squared_error(y4_test, yp4),
        r2_score(y4_test, yp4),
        t14,
    ]
)

X4tr_mm = mm4.fit_transform(X4tr)
X4te_mm = mm4.transform(X4te)
sel4 = SelectKBest(score_func=mutual_info_regression, k=k)
X4tr_k2 = sel4.fit_transform(X4tr_mm, y4_train)
X4te_k2 = sel4.transform(X4te_mm)
clf4 = LinearRegression()
t04 = time.perf_counter() 
clf4.fit(X4tr_k2, y4_train) 
t24 = time.perf_counter() - t04
yp4 = clf4.predict(X4te_k2)
rows4.append([f'SelectKBest mutual_info_regression Diabetes (k={k})', k, mean_squared_error(y4_test, yp4), r2_score(y4_test, yp4), t24])

k = min(20, X4tr.shape[1])
sel4 = SelectKBest(score_func=r_regression, k=k)
X4tr_k = sel4.fit_transform(X4tr_s0, y4_train)
X4te_k = sel4.transform(X4te_s0)
clf4 = LinearRegression()

t04 = time.perf_counter()
clf4.fit(X4tr_k, y4_train)
t14 = time.perf_counter() - t04
yp4 = clf4.predict(X4te_k)
rows4.append(
    [
        f"SelectKBest r_regression Diabetes(k={k})",
        k,
        mean_squared_error(y4_test, yp4),
        r2_score(y4_test, yp4),
        t14,
    ]
)

X4tr_mm = mm.fit_transform(X4tr)
X4te_mm = mm.transform(X4te)
sel4 = SelectKBest(score_func=mutual_info_regression, k=k)
X4tr_k2 = sel.fit_transform(X4tr_mm, y4_train)
X4te_k2 = sel.transform(X4te_mm)
clf4 = LinearRegression()
t04 = time.perf_counter(); clf4.fit(X4tr_k2, y4_train); t24 = time.perf_counter() - t04
yp4 = clf4.predict(X4te_k2)
rows4.append([f'SelectKBest mutual_info_regression Diabetes (k={k})', k, mean_squared_error(y4_test, yp4), r2_score(y4_test, yp4), t24])


In [None]:
pd.DataFrame(
    rows3,
    columns=["Tratamiento", "n_features", "Accuracy", "F1-macro", "tiempo_entreno_s"],
)

Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_entreno_s
0,Sin selección,13,1.0,1.0,0.001592
1,SelectKBest f_classif Wines (k=10),10,1.0,1.0,0.002074
2,SelectKBest chi2 Wines (k=10),10,0.977778,0.972262,0.001786
3,SelectKBest f_classif Wines (k=5),5,0.977778,0.972262,0.001551
4,SelectKBest chi2 Wines (k=5),5,0.911111,0.908267,0.001517
5,SelectKBest f_classif Wines(k=13),13,1.0,1.0,0.002362
6,SelectKBest chi2 Wines (k=13),13,0.977778,0.972262,0.00148


In [47]:
pd.DataFrame(
    rows4,
    columns=["Tratamiento", "n_features", "MSE", "R2", "tiempo_entreno_s"],
)

Unnamed: 0,Tratamiento,n_features,MSE,R2,tiempo_entreno_s
0,Sin selección,10,3180.159648,0.359409,0.000938
1,SelectKBest r_regression Diabetes (k=10),10,3180.159648,0.359409,0.002074
2,SelectKBest mutual_info_regression Diabetes (k...,10,3180.159648,0.359409,0.001786
3,SelectKBest r_regression Diabetes (k=5),5,3359.171112,0.32335,0.000796
4,SelectKBest mutual_info_regression Diabetes (k=5),5,3571.44051,0.280592,0.000869
5,SelectKBest r_regression Diabetes(k=10),10,3180.159648,0.359409,0.001039
6,SelectKBest mutual_info_regression Diabetes (k...,10,3214.9851,0.352394,0.000992


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos y decidiendo cuál es el mejor número de características en cada caso

## 5) RFECV
Usamos **RFECV** para encontrar automáticamente cuántas características dejar. Después reentrenamos una RL con esas características.

In [3]:
data = load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

imp = SimpleImputer(); Xtr = imp.fit_transform(X_train); Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)

est = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
rfecv = RFECV(estimator=est, step=2, cv=5, scoring='f1_macro', n_jobs=-1)
t0 = time.perf_counter(); rfecv.fit(Xtr_s, y_train); t_sel = time.perf_counter() - t0
nsel = int(getattr(rfecv, 'n_features_', Xtr.shape[1]))

Xtr_sel = rfecv.transform(Xtr_s); Xte_sel = rfecv.transform(Xte_s)
final = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); final.fit(Xtr_sel, y_train); t_fit = time.perf_counter() - t0
yp = final.predict(Xte_sel)

print('RFECV')
print('n_features seleccionadas:', nsel)
print(f'Accuracy: {accuracy_score(y_test, yp):.4f}  |  F1-macro: {f1_score(y_test, yp, average="macro"):.4f}  |  tiempo_total: {t_sel + t_fit:.3f}s')

RFECV
n_features seleccionadas: 6
Accuracy: 0.9441  |  F1-macro: 0.9400  |  tiempo_total: 6.011s


In [4]:
# OBLIGATORIO: Compara RFECV con filtro en las mismas condiciones (mismo dataset, misma imputación/escalado, mismo número de características) y explica cuál es mejor
sel = SelectKBest(score_func=f_classif, k=nsel)
Xtr_kbest = sel.fit_transform(Xtr_s, y_train)
Xte_kbest = sel.transform(Xte_s)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_kbest, y_train)
t_kbest = time.perf_counter() - t0
yp_kbest = clf.predict(Xte_kbest)
acc_kbest = accuracy_score(y_test, yp_kbest)
f1_kbest = f1_score(y_test, yp_kbest, average="macro")

print("Filtro")
print(f"SelectKBest (k={nsel})")
print(f"Accuracy: {acc_kbest:.4f}  |  F1-macro: {f1_kbest:.4f}  |  tiempo: {t_kbest:.3f}s")

Filtro
SelectKBest (k=6)
Accuracy: 0.9301  |  F1-macro: 0.9251  |  tiempo: 0.002s


In [None]:
# OBLIGATORIO: Eligen RFECV y filtro las mismas variables?
print('RFECV')
print(rfecv.get_support(indices=True))
print('\nFiltro: SelectKBest')
print(sel.get_support(indices=True))

RFECV
[ 7 20 21 22 23 27]

Filtro: SelectKBest
[ 0  2  7 20 22 27]


In [21]:
# OPCIONAL: Repite todo para un problema de regresión

### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## 6) SelectFromModel
Primero seleccionamos características y luego reentrenamos para comparar solo el efecto de la selección.

In [10]:
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)


In [11]:
rows = []
# L1 como selector
sel1 = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear", random_state=RANDOM_STATE)
)
t0 = time.perf_counter()
sel1.fit(Xtr_s, y_train)
t_sel1 = time.perf_counter() - t0
Xtr_sel = sel1.transform(Xtr_s)
Xte_sel = sel1.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit1 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(L1 LR) + LR L2",
        int(sel1.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel1 + t_fit1,
    ]
)


In [12]:
# RandomForest como selector
sel2 = SelectFromModel(
    RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
)
t0 = time.perf_counter()
sel2.fit(Xtr_s, y_train)
t_sel2 = time.perf_counter() - t0
Xtr_sel = sel2.transform(Xtr_s)
Xte_sel = sel2.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit2 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(RandomForest) + LR L2",
        int(sel2.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel2 + t_fit2,
    ]
)


In [13]:
pd.DataFrame(rows, columns=['Tratamiento', 'n_features', 'Accuracy', 'F1-macro', 'tiempo_total_s'])

Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_total_s
0,SFM(L1 LR) + LR L2,14,0.958042,0.955031,0.00695
1,SFM(RandomForest) + LR L2,9,0.951049,0.94733,0.430151


In [14]:
# OBLIGATORIO: Compara los tres métodos de selección de características en las mismas condiciones 
# (usando el mismo dataset, misma imputación/escalado, mismo número de características) y explica cuál es mejor
est = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
rfecv = RFECV(estimator=est, step=2, cv=5, scoring='f1_macro', n_jobs=-1)
t0 = time.perf_counter(); rfecv.fit(Xtr_s, y_train); t_sel = time.perf_counter() - t0
nsel = int(getattr(rfecv, 'n_features_', Xtr.shape[1]))

Xtr_sel = rfecv.transform(Xtr_s); Xte_sel = rfecv.transform(Xte_s)
final = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); final.fit(Xtr_sel, y_train); t_fit = time.perf_counter() - t0
yp = final.predict(Xte_sel)

print('RFECV')
print('n_features seleccionadas:', nsel)
print(f'Accuracy: {accuracy_score(y_test, yp):.4f}  |  F1-macro: {f1_score(y_test, yp, average="macro"):.4f}  |  tiempo_total: {t_sel + t_fit:.3f}s')

print("-----------------------")
sel = SelectKBest(score_func=f_classif, k=nsel)
Xtr_kbest = sel.fit_transform(Xtr_s, y_train)
Xte_kbest = sel.transform(Xte_s)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_kbest, y_train)
t_kbest = time.perf_counter() - t0
yp_kbest = clf.predict(Xte_kbest)
acc_kbest = accuracy_score(y_test, yp_kbest)
f1_kbest = f1_score(y_test, yp_kbest, average="macro")

print("Filtro")
print(f"SelectKBest (k={nsel})")
print(f"Accuracy: {acc_kbest:.4f}  |  F1-macro: {f1_kbest:.4f}  |  tiempo: {t_kbest:.3f}s")

RFECV
n_features seleccionadas: 6
Accuracy: 0.9441  |  F1-macro: 0.9400  |  tiempo_total: 5.439s
-----------------------
Filtro
SelectKBest (k=6)
Accuracy: 0.9301  |  F1-macro: 0.9251  |  tiempo: 0.002s


In [18]:
# OBLIGATORIO: Elige este método las mismas variables que los anteriores? 
print('SFM(L1 LR) + LR L2')
print(sel1.get_support(indices=True))
print('\nSFM(RandomForest) + LR L2')
print(sel2.get_support(indices=True))

SFM(L1 LR) + LR L2
[ 6  7  9 10 14 15 18 20 21 23 24 26 27 28]

SFM(RandomForest) + LR L2
[ 2  3  6  7 20 22 23 26 27]


In [28]:
# OPCIONAL: Repite todo para un problema de regresión


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## 7) Selección de instancias
Reducimos deliberadamente el tamaño del conjunto de entrenamiento y comparamos con entrenar con todo el train.



- **CNN** (Condensed Nearest Neighbour): condensa el train manteniendo representantes.
- **ENN** (Edited Nearest Neighbours): elimina ejemplos conflictivos.

> Requiere imbalanced-learn: pip install imbalanced-learn.

In [8]:
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours

# Preprocesado (imputación + escalado con train)
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

rows = []


In [55]:
# A) Todo el train
clf_full = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_full.fit(Xtr_s, y_train)
t_full = time.perf_counter() - t0
yp_full = clf_full.predict(Xte_s)
acc_full = accuracy_score(y_test, yp_full)
f1_full = f1_score(y_test, yp_full, average="macro")
rows.append(["Todo el train", Xtr_s.shape[0], acc_full, f1_full, t_full])


In [56]:
# B) CNN (condensado)
cnn = CondensedNearestNeighbour(random_state=RANDOM_STATE)
Xtr_cnn, ytr_cnn = cnn.fit_resample(Xtr_s, y_train)
clf_cnn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_cnn.fit(Xtr_cnn, ytr_cnn)
t_cnn = time.perf_counter() - t0
yp_cnn = clf_cnn.predict(Xte_s)
rows.append(
    [
        "CNN (condensado)",
        Xtr_cnn.shape[0],
        accuracy_score(y_test, yp_cnn),
        f1_score(y_test, yp_cnn, average="macro"),
        t_cnn,
    ]
)

In [57]:
# C) ENN (edición)
enn = EditedNearestNeighbours()
Xtr_enn, ytr_enn = enn.fit_resample(Xtr_s, y_train)
clf_enn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_enn.fit(Xtr_enn, ytr_enn)
t_enn = time.perf_counter() - t0
yp_enn = clf_enn.predict(Xte_s)
rows.append(
    [
        "ENN (edición)",
        Xtr_enn.shape[0],
        accuracy_score(y_test, yp_enn),
        f1_score(y_test, yp_enn, average="macro"),
        t_enn,
    ]
)


In [58]:

pd.DataFrame(
    rows,
    columns=[
        "Tratamiento",
        "n_instancias_train",
        "Accuracy",
        "F1-macro",
        "tiempo_entreno_s",
    ],
)


Unnamed: 0,Tratamiento,n_instancias_train,Accuracy,F1-macro,tiempo_entreno_s
0,Todo el train,426,0.958042,0.955031,0.004146
1,CNN (condensado),201,0.951049,0.948116,0.002171
2,ENN (edición),408,0.951049,0.948116,0.004675


In [62]:
# OBLIGATORIO: Usa un nuevo dataset de clasificación y repítelo todo
from sklearn.datasets import load_iris
data = load_iris()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

rows = []

clf_full = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_full.fit(Xtr_s, y_train)
t_full = time.perf_counter() - t0
yp_full = clf_full.predict(Xte_s)
acc_full = accuracy_score(y_test, yp_full)
f1_full = f1_score(y_test, yp_full, average="macro")
rows.append(["Todo el train", Xtr_s.shape[0], acc_full, f1_full, t_full])

cnn = CondensedNearestNeighbour(random_state=RANDOM_STATE)
Xtr_cnn, ytr_cnn = cnn.fit_resample(Xtr_s, y_train)
clf_cnn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_cnn.fit(Xtr_cnn, ytr_cnn)
t_cnn = time.perf_counter() - t0
yp_cnn = clf_cnn.predict(Xte_s)
rows.append(
    [
        "CNN (condensado)",
        Xtr_cnn.shape[0],
        accuracy_score(y_test, yp_cnn),
        f1_score(y_test, yp_cnn, average="macro"),
        t_cnn,
    ]
)

enn = EditedNearestNeighbours()
Xtr_enn, ytr_enn = enn.fit_resample(Xtr_s, y_train)
clf_enn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_enn.fit(Xtr_enn, ytr_enn)
t_enn = time.perf_counter() - t0
yp_enn = clf_enn.predict(Xte_s)
rows.append(
    [
        "ENN (edición)",
        Xtr_enn.shape[0],
        accuracy_score(y_test, yp_enn),
        f1_score(y_test, yp_enn, average="macro"),
        t_enn,
    ]
)

pd.DataFrame(
    rows,
    columns=[
        "Tratamiento",
        "n_instancias_train",
        "Accuracy",
        "F1-macro",
        "tiempo_entreno_s",
    ],
)

Unnamed: 0,Tratamiento,n_instancias_train,Accuracy,F1-macro,tiempo_entreno_s
0,Todo el train,112,0.842105,0.837662,0.001874
1,CNN (condensado),48,0.657895,0.54955,0.005259
2,ENN (edición),94,0.763158,0.76,0.00273


In [10]:
# OPCIONAL: Usa un dataset de regresión
from sklearn.datasets import load_diabetes
data = load_diabetes()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

rows = []

clf_full = LinearRegression()
t0 = time.perf_counter()
clf_full.fit(Xtr_s, y_train)
t_full = time.perf_counter() - t0
yp_full = clf_full.predict(Xte_s)
acc_full = mean_squared_error(y_test, yp_full)
f1_full = r2_score(y_test, yp_full)
rows.append(["Todo el train", Xtr_s.shape[0], acc_full, f1_full, t_full])

cnn = CondensedNearestNeighbour(random_state=RANDOM_STATE)
Xtr_cnn, ytr_cnn = cnn.fit_resample(Xtr_s, y_train)
clf_cnn = LinearRegression()
t0 = time.perf_counter()
clf_cnn.fit(Xtr_cnn, ytr_cnn)
t_cnn = time.perf_counter() - t0
yp_cnn = clf_cnn.predict(Xte_s)
rows.append(
    [
        "CNN (condensado)",
        Xtr_cnn.shape[0],
        mean_squared_error(y_test, yp_cnn),
        r2_score(y_test, yp_cnn),
        t_cnn,
    ]
)

enn = EditedNearestNeighbours()
Xtr_enn, ytr_enn = enn.fit_resample(Xtr_s, y_train)
clf_enn = LinearRegression()
t0 = time.perf_counter()
clf_enn.fit(Xtr_enn, ytr_enn)
t_enn = time.perf_counter() - t0
yp_enn = clf_enn.predict(Xte_s)
rows.append(
    [
        "ENN (edición)",
        Xtr_enn.shape[0],
        mean_squared_error(y_test, yp_enn),
        r2_score(y_test, yp_enn),
        t_enn,
    ]
)

pd.DataFrame(
    rows,
    columns=[
        "Tratamiento",
        "n_instancias_train",
        "MSE",
        "R2",
        "tiempo_entreno_s",
    ],
)

Unnamed: 0,Tratamiento,n_instancias_train,MSE,R2,tiempo_entreno_s
0,Todo el train,331,3180.159648,0.359409,0.000861
1,CNN (condensado),256,3221.669237,0.351047,0.001714
2,ENN (edición),1,21288.504505,-3.288221,0.000925


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## Demostración de Pipeline (sin entregables)
Esto no es necesario para entender los métodos; simplemente muestra cómo encadenar pasos.

In [36]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ("imp", SimpleImputer()),
        ("sc", StandardScaler()),
        (
            "sel",
            SelectFromModel(
                LogisticRegression(
                    penalty="l1", solver="liblinear", random_state=RANDOM_STATE
                )
            ),
        ),
        (
            "clf",
            LogisticRegression(
                penalty="l2", solver="liblinear", random_state=RANDOM_STATE
            ),
        ),
    ]
)
t0 = time.perf_counter()
pipe.fit(X_train, y_train)
t = time.perf_counter() - t0
yp = pipe.predict(X_test)
print(
    f"Pipeline → Accuracy: {accuracy_score(y_test, yp):.4f} | F1-macro: {f1_score(y_test, yp, average='macro'):.4f} | tiempo: {t:.3f}s"
)


Pipeline → Accuracy: 0.9580 | F1-macro: 0.9550 | tiempo: 0.005s
