# Práctica 2

Objetivo: comprender cada técnica de *preparación de datos* y *selección de características*.



**Contenido**
1. Imports y utilidades
2. Línea base (escalado + regresión logística)
3. Imputación (comparación con eliminar filas perdidas)
4. Selección tipo filtro (f\_classif y chi2)
5. RFECV (eliminación recursiva con validación cruzada)
6. SelectFromModel (L1 y Random Forest)
7. Selección de instancias
8. (Opcional) Demostración con `Pipeline`
9. (Opcional) Mini ejemplo de **regresión**

## 1) Imports y utilidades

In [58]:
# 1) Configuración y carga del dataset (clasificación)
import warnings, time
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Diferentes datasets de clasificación y regresión que se pueden usar
from sklearn.datasets import load_breast_cancer, fetch_california_housing, load_diabetes

# Algunas utilidades
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, chi2, RFECV, SelectFromModel

RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)

def standardize_train_test(Xtr, Xte):
    sc = StandardScaler()
    return sc.fit_transform(Xtr), sc.transform(Xte)

def simulate_missingness(X, missing_rate=0.05, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    X2 = X.astype(float).copy()
    n, d = X2.shape
    m = int(missing_rate * n * d)
    idx = rng.choice(n*d, m, replace=False)
    X2[idx // d, idx % d] = np.nan
    return X2

# Cargamos un problema de clasificación: Breast Cancer (binaria)
data = load_breast_cancer()
X, y = data.data, data.target
print('X shape:', X.shape, '| y shape:', y.shape)

# Partición train/test estratificada
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print('Train:', X_train.shape, '| Test:', X_test.shape)

X shape: (569, 30) | y shape: (569,)
Train: (426, 30) | Test: (143, 30)


## 2) Línea base (escalado + clasificador sencillo)
Entrenamos sin selección ni imputación en un dataset sin valores perdidos.

In [59]:
# Escalado
Xtr_s, Xte_s = standardize_train_test(X_train, X_test)

# Clasificador ligero (rápido en aula)
clf_base = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf_base.fit(Xtr_s, y_train); t_base = time.perf_counter() - t0
yp = clf_base.predict(Xte_s)
acc_base = accuracy_score(y_test, yp)
f1_base = f1_score(y_test, yp, average='macro')

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9580  |  F1-macro: 0.9550  |  tiempo: 0.003s


## 3) Imputación (comparación con eliminar filas perdidas)
Simulamos un **5%** de valores perdidos y comparamos:
- **Eliminar filas con NaN** (train y test por separado)
- **Imputación simple** (media)
- **Imputación KNN** (k=5)

In [60]:
# Simulamos valores perdidos
Xtr_m = simulate_missingness(X_train, 0.05)
Xte_m = simulate_missingness(X_test, 0.05)
res_imput = []

In [47]:
# A) Eliminar filas con NaN (cuidado: reducimos datos)
mask_tr = ~np.isnan(Xtr_m).any(axis=1)
mask_te = ~np.isnan(Xte_m).any(axis=1)
Xtr_drop, ytr_drop = Xtr_m[mask_tr], y_train[mask_tr]
Xte_drop, yte_drop = Xte_m[mask_te], y_test[mask_te]
Xtr_s, Xte_s = standardize_train_test(Xtr_drop, Xte_drop)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, ytr_drop)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Eliminar filas",
        Xtr_drop.shape[0],
        Xte_drop.shape[0],
        accuracy_score(yte_drop, yp),
        f1_score(yte_drop, yp, average="macro"),
        t,
    ]
)

In [48]:
# B) SimpleImputer (media)
imp = SimpleImputer(strategy="mean")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: media",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [49]:
# C) KNNImputer (k=5)
imp = KNNImputer(n_neighbors=5)
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: KNN (k=5)",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)


In [50]:
# OBLIGATORIO: Añade otro método de imputación básico
# Utilizamos imputación con la mediana
imp = SimpleImputer(strategy="median")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: mediana",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [51]:
# OBLIGATORIO: Añade otro método de imputación avanzado
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.svm import SVR

imp = IterativeImputer(
    estimator=SVR(kernel="rbf", C=10, epsilon=0.1, gamma="scale"),
    initial_strategy="median",
    max_iter=10,
    random_state=0,
)

Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: SVR",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [None]:
for fila in res_imput:
    print (fila)

['Eliminar filas', 90, 33, 0.9393939393939394, 0.9379699248120301, 0.001662900000155787]
['Imputación: media', 426, 143, 0.965034965034965, 0.9623783214943435, 0.0033559999992576195]
['Imputación: KNN (k=5)', 426, 143, 0.958041958041958, 0.9550314465408805, 0.0032467999999425956]
['Imputación: mediana', 426, 143, 0.958041958041958, 0.9550314465408805, 0.0033247000001210836]
['Imputación: SVR', 426, 143, 0.9370629370629371, 0.9322809786898185, 0.003037899999981164]


In [None]:
# OPCIONAL: Usa un dataset de regresión, repite todos los métodos de imputación y muestra los resultados.
# No olvides adaptar todas las métricas: Accuracy/F1 solo sirven para problemas de clasificación
from sklearn.linear_model import LinearRegression

data2 = fetch_california_housing()
X, y = data2.data, data2.target
print('X shape:', X.shape, '| y shape:', y.shape)

# Partición train/test estratificada
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

Xtr_m = simulate_missingness(X_train, 0.05)
Xte_m = simulate_missingness(X_test, 0.05)
res_imput = []

# B) SimpleImputer (media)
imp = SimpleImputer(strategy="mean")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LinearRegression()
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: media",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        mean_squared_error(y_test, yp),
        r2_score(y_test, yp),
        t,
    ]
)

# C) KNNImputer (k=5)
imp = KNNImputer(n_neighbors=5)
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LinearRegression()
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: KNN (k=5)",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        mean_squared_error(y_test, yp),
        r2_score(y_test, yp),
        t,
    ]
)

# OBLIGATORIO: Añade otro método de imputación básico
# Utilizamos imputación con la mediana
imp = SimpleImputer(strategy="median")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LinearRegression()
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: mediana",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        mean_squared_error(y_test, yp),
        r2_score(y_test, yp),
        t,
    ]
)

# OBLIGATORIO: Añade otro método de imputación avanzado
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.svm import SVR

imp = IterativeImputer(
    estimator=SVR(kernel="rbf", C=10, epsilon=0.1, gamma="scale"),
    initial_strategy="median",
    max_iter=10,
    random_state=0,
)

Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LinearRegression()
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: SVR",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        mean_squared_error(y_test, yp),
        r2_score(y_test, yp),
        t,
    ]
)

for fila in res_imput:
    print (fila)

X shape: (20640, 8) | y shape: (20640,)
['Imputación: media', 15480, 5160, 0.6320492952950263, 0.5218451545074996, 0.0030262000000220723]
['Imputación: KNN (k=5)', 15480, 5160, 0.5752163798324154, 0.5648401141003718, 0.001670199999352917]
['Imputación: mediana', 15480, 5160, 0.6481031351028333, 0.5097001820346163, 0.002002600000196253]
['Imputación: SVR', 15480, 5160, 0.6351649509503047, 0.5194881139102752, 0.0018387000000075204]


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos y decidiendo cuál es el mejor método de imputación en cada caso

In [54]:
# Mostramos resultados
df_imput = pd.DataFrame(res_imput, columns=['Tratamiento', 'n_train', 'n_test', 'Accuracy', 'F1-macro', 'tiempo_entreno_s'])
df_imput

Unnamed: 0,Tratamiento,n_train,n_test,Accuracy,F1-macro,tiempo_entreno_s
0,Imputación: media,15480,5160,0.632049,0.521845,0.003026
1,Imputación: KNN (k=5),15480,5160,0.575216,0.56484,0.00167
2,Imputación: mediana,15480,5160,0.648103,0.5097,0.002003
3,Imputación: SVR,15480,5160,0.635165,0.519488,0.001839


## 4) Selección tipo filtro
Comparamos *sin selección* vs **SelectKBest** con:
- `f_classif` (general)
- `chi2` (requiere no-negatividad, debemos aplicar `MinMaxScaler` antes de usarlo)

In [67]:
# Usamos los datos SIN NaN (X_train / X_test originales)
imp = SimpleImputer()  # por seguridad
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s0, Xte_s0 = standardize_train_test(Xtr, Xte)

# Baseline
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s0, y_train)
t_base2 = time.perf_counter() - t0
yp = clf.predict(Xte_s0)
acc0 = accuracy_score(y_test, yp)
f10 = f1_score(y_test, yp, average="macro")

rows = [["Sin selección", Xtr.shape[1], acc0, f10, t_base2]]


In [68]:
# SelectKBest f_classif (k=10)
k = min(10, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)


In [69]:
# SelectKBest chi2 (k=10) → MinMax
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])



In [None]:
# OBLIGATORIO: Prueba con diferentes valores de n_features
k = min(5, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])

k = min(20, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])

for fila in rows:
    print (fila)

['Sin selección', 30, 0.958041958041958, 0.9550314465408805, 0.003491500000563974]
['SelectKBest f_classif (k=10)', 10, 0.951048951048951, 0.947329650092081, 0.002073200000268116]
['SelectKBest chi2 (k=10)', 10, 0.9300699300699301, 0.9244505494505495, 0.001654200000302808]
['SelectKBest f_classif (k=5)', 5, 0.9300699300699301, 0.9250524109014675, 0.001465999999709311]
['SelectKBest chi2 (k=5)', 5, 0.9300699300699301, 0.9244505494505495, 0.0022325999998429324]
['SelectKBest f_classif (k=20)', 20, 0.965034965034965, 0.9620669531540135, 0.0023682999999437016]
['SelectKBest chi2 (k=20)', 20, 0.9440559440559441, 0.9390451832907076, 0.003485000000182481]


In [73]:
# OBLIGATORIO: Usa un segundo problema de clasificación y repite todo
# No escoger digits -> Ejemplo para usar "Wines" (Hay que buscar)
from sklearn.datasets import load_wine
data3 = load_wine()
X, y = data3.data, data3.target

# Partición train/test estratificada
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

Xtr_m = simulate_missingness(X_train, 0.05)
Xte_m = simulate_missingness(X_test, 0.05)
res_imput = []

imp = SimpleImputer()  # por seguridad
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s0, Xte_s0 = standardize_train_test(Xtr, Xte)

# Baseline
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s0, y_train)
t_base2 = time.perf_counter() - t0
yp = clf.predict(Xte_s0)
acc0 = accuracy_score(y_test, yp)
f10 = f1_score(y_test, yp, average="macro")

rows = [["Sin selección", Xtr.shape[1], acc0, f10, t_base2]]


k = min(10, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif Wines (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

mm = MinMaxScaler()
Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 Wines (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])

k = min(5, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif Wines (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 Wines (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])

k = min(20, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif Wines(k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)

Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 Wines (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])


In [None]:
# OPCIONAL: Usa un problema de regresión, adapta lo necesario y repite todo


In [74]:
pd.DataFrame(
    rows,
    columns=["Tratamiento", "n_features", "Accuracy", "F1-macro", "tiempo_entreno_s"],
)


Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_entreno_s
0,Sin selección,13,1.0,1.0,0.001855
1,SelectKBest f_classif Wines (k=10),10,1.0,1.0,0.001657
2,SelectKBest chi2 Wines (k=10),10,0.977778,0.972262,0.002043
3,SelectKBest f_classif Wines (k=5),5,0.977778,0.972262,0.001288
4,SelectKBest chi2 Wines (k=5),5,0.911111,0.908267,0.001175
5,SelectKBest f_classif Wines(k=13),13,1.0,1.0,0.00163
6,SelectKBest chi2 Wines (k=13),13,0.977778,0.972262,0.001445


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos y decidiendo cuál es el mejor número de características en cada caso

## 5) RFECV
Usamos **RFECV** para encontrar automáticamente cuántas características dejar. Después reentrenamos una RL con esas características.

In [76]:
data = load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

Xtr_s, Xte_s = standardize_train_test(X_train, X_test)

imp = SimpleImputer(); Xtr = imp.fit_transform(X_train); Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)

est = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
rfecv = RFECV(estimator=est, step=2, cv=5, scoring='f1_macro', n_jobs=-1)
t0 = time.perf_counter(); rfecv.fit(Xtr_s, y_train); t_sel = time.perf_counter() - t0
nsel = int(getattr(rfecv, 'n_features_', Xtr.shape[1]))

Xtr_sel = rfecv.transform(Xtr_s); Xte_sel = rfecv.transform(Xte_s)
final = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); final.fit(Xtr_sel, y_train); t_fit = time.perf_counter() - t0
yp = final.predict(Xte_sel)

print('RFECV')
print('n_features seleccionadas:', nsel)
print(f'Accuracy: {accuracy_score(y_test, yp):.4f}  |  F1-macro: {f1_score(y_test, yp, average="macro"):.4f}  |  tiempo_total: {t_sel + t_fit:.3f}s')

RFECV
n_features seleccionadas: 6
Accuracy: 0.9441  |  F1-macro: 0.9400  |  tiempo_total: 3.012s


In [None]:
# OBLIGATORIO: Compara RFECV con filtro en las mismas condiciones (mismo dataset, misma imputación/escalado, mismo número de características) y explica cuál es mejor


In [20]:
# OBLIGATORIO: Eligen RFECV y filtro las mismas variables?

In [21]:
# OPCIONAL: Repite todo para un problema de regresión

### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## 6) SelectFromModel
Primero seleccionamos características y luego reentrenamos para comparar solo el efecto de la selección.

In [22]:
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)


In [23]:
rows = []
# L1 como selector
sel1 = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear", random_state=RANDOM_STATE)
)
t0 = time.perf_counter()
sel1.fit(Xtr_s, y_train)
t_sel1 = time.perf_counter() - t0
Xtr_sel = sel1.transform(Xtr_s)
Xte_sel = sel1.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit1 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(L1 LR) + LR L2",
        int(sel1.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel1 + t_fit1,
    ]
)


In [24]:
# RandomForest como selector
sel2 = SelectFromModel(
    RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
)
t0 = time.perf_counter()
sel2.fit(Xtr_s, y_train)
t_sel2 = time.perf_counter() - t0
Xtr_sel = sel2.transform(Xtr_s)
Xte_sel = sel2.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit2 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(RandomForest) + LR L2",
        int(sel2.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel2 + t_fit2,
    ]
)


In [25]:
pd.DataFrame(rows, columns=['Tratamiento', 'n_features', 'Accuracy', 'F1-macro', 'tiempo_total_s'])

Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_total_s
0,SFM(L1 LR) + LR L2,14,0.958042,0.955031,0.002982
1,SFM(RandomForest) + LR L2,9,0.951049,0.94733,0.14106


In [None]:
# OBLIGATORIO: Compara los tres métodos de selección de características en las mismas condiciones 
# (usando el mismo dataset, misma imputación/escalado, mismo número de características) y explica cuál es mejor


In [27]:
# OBLIGATORIO: Elige este método las mismas variables que los anteriores? 

In [28]:
# OPCIONAL: Repite todo para un problema de regresión


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## 7) Selección de instancias
Reducimos deliberadamente el tamaño del conjunto de entrenamiento y comparamos con entrenar con todo el train.



- **CNN** (Condensed Nearest Neighbour): condensa el train manteniendo representantes.
- **ENN** (Edited Nearest Neighbours): elimina ejemplos conflictivos.

> Requiere imbalanced-learn: pip install imbalanced-learn.

In [29]:
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours

# Preprocesado (imputación + escalado con train)
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

rows = []


In [30]:
# A) Todo el train
clf_full = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_full.fit(Xtr_s, y_train)
t_full = time.perf_counter() - t0
yp_full = clf_full.predict(Xte_s)
acc_full = accuracy_score(y_test, yp_full)
f1_full = f1_score(y_test, yp_full, average="macro")
rows.append(["Todo el train", Xtr_s.shape[0], acc_full, f1_full, t_full])


In [31]:
# B) CNN (condensado)
cnn = CondensedNearestNeighbour(random_state=RANDOM_STATE)
Xtr_cnn, ytr_cnn = cnn.fit_resample(Xtr_s, y_train)
clf_cnn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_cnn.fit(Xtr_cnn, ytr_cnn)
t_cnn = time.perf_counter() - t0
yp_cnn = clf_cnn.predict(Xte_s)
rows.append(
    [
        "CNN (condensado)",
        Xtr_cnn.shape[0],
        accuracy_score(y_test, yp_cnn),
        f1_score(y_test, yp_cnn, average="macro"),
        t_cnn,
    ]
)

In [32]:
# C) ENN (edición)
enn = EditedNearestNeighbours()
Xtr_enn, ytr_enn = enn.fit_resample(Xtr_s, y_train)
clf_enn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_enn.fit(Xtr_enn, ytr_enn)
t_enn = time.perf_counter() - t0
yp_enn = clf_enn.predict(Xte_s)
rows.append(
    [
        "ENN (edición)",
        Xtr_enn.shape[0],
        accuracy_score(y_test, yp_enn),
        f1_score(y_test, yp_enn, average="macro"),
        t_enn,
    ]
)


In [33]:

pd.DataFrame(
    rows,
    columns=[
        "Tratamiento",
        "n_instancias_train",
        "Accuracy",
        "F1-macro",
        "tiempo_entreno_s",
    ],
)


Unnamed: 0,Tratamiento,n_instancias_train,Accuracy,F1-macro,tiempo_entreno_s
0,Todo el train,426,0.958042,0.955031,0.002005
1,CNN (condensado),201,0.951049,0.948116,0.000971
2,ENN (edición),408,0.951049,0.948116,0.002074


In [34]:
# OBLIGATORIO: Usa un nuevo dataset de clasificación y repítelo todo

In [35]:
# OPCIONAL: Usa un dataset de regresión

### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## Demostración de Pipeline (sin entregables)
Esto no es necesario para entender los métodos; simplemente muestra cómo encadenar pasos.

In [36]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ("imp", SimpleImputer()),
        ("sc", StandardScaler()),
        (
            "sel",
            SelectFromModel(
                LogisticRegression(
                    penalty="l1", solver="liblinear", random_state=RANDOM_STATE
                )
            ),
        ),
        (
            "clf",
            LogisticRegression(
                penalty="l2", solver="liblinear", random_state=RANDOM_STATE
            ),
        ),
    ]
)
t0 = time.perf_counter()
pipe.fit(X_train, y_train)
t = time.perf_counter() - t0
yp = pipe.predict(X_test)
print(
    f"Pipeline → Accuracy: {accuracy_score(y_test, yp):.4f} | F1-macro: {f1_score(y_test, yp, average='macro'):.4f} | tiempo: {t:.3f}s"
)


Pipeline → Accuracy: 0.9580 | F1-macro: 0.9550 | tiempo: 0.005s
