# Práctica 2

Objetivo: comprender cada técnica de *preparación de datos* y *selección de características*.



**Contenido**
1. Imports y utilidades
2. Línea base (escalado + regresión logística)
3. Imputación (comparación con eliminar filas perdidas)
4. Selección tipo filtro (f\_classif y chi2)
5. RFECV (eliminación recursiva con validación cruzada)
6. SelectFromModel (L1 y Random Forest)
7. Selección de instancias
8. (Opcional) Demostración con `Pipeline`
9. (Opcional) Mini ejemplo de **regresión**

## 1) Imports y utilidades

In [43]:
# 1) Configuración y carga del dataset (clasificación)
import warnings, time
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Diferentes datasets de clasificación y regresión que se pueden usar
from sklearn.datasets import load_breast_cancer, fetch_california_housing, load_diabetes

# Algunas utilidades
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, chi2, RFECV, SelectFromModel

RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)

def standardize_train_test(Xtr, Xte):
    sc = StandardScaler()
    return sc.fit_transform(Xtr), sc.transform(Xte)

def simulate_missingness(X, missing_rate=0.05, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    X2 = X.astype(float).copy()
    n, d = X2.shape
    m = int(missing_rate * n * d)
    idx = rng.choice(n*d, m, replace=False)
    X2[idx // d, idx % d] = np.nan
    return X2

# Cargamos un problema de clasificación: Breast Cancer (binaria)
data = load_breast_cancer()
X, y = data.data, data.target
print('X shape:', X.shape, '| y shape:', y.shape)

# Partición train/test estratificada
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print('Train:', X_train.shape, '| Test:', X_test.shape)

X shape: (569, 30) | y shape: (569,)
Train: (426, 30) | Test: (143, 30)


## 2) Línea base (escalado + clasificador sencillo)
Entrenamos sin selección ni imputación en un dataset sin valores perdidos.

In [44]:
# Escalado
Xtr_s, Xte_s = standardize_train_test(X_train, X_test)

# Clasificador ligero (rápido en aula)
clf_base = LogisticRegression(penalty='l2', solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf_base.fit(Xtr_s, y_train); t_base = time.perf_counter() - t0
yp = clf_base.predict(Xte_s)
acc_base = accuracy_score(y_test, yp)
f1_base = f1_score(y_test, yp, average='macro')

print('BASELINE')
print(f'Accuracy: {acc_base:.4f}  |  F1-macro: {f1_base:.4f}  |  tiempo: {t_base:.3f}s')

BASELINE
Accuracy: 0.9580  |  F1-macro: 0.9550  |  tiempo: 0.002s


## 3) Imputación (comparación con eliminar filas perdidas)
Simulamos un **5%** de valores perdidos y comparamos:
- **Eliminar filas con NaN** (train y test por separado)
- **Imputación simple** (media)
- **Imputación KNN** (k=5)

In [45]:
# Simulamos valores perdidos
Xtr_m = simulate_missingness(X_train, 0.05)
Xte_m = simulate_missingness(X_test, 0.05)
res_imput = []

In [46]:
# A) Eliminar filas con NaN (cuidado: reducimos datos)
mask_tr = ~np.isnan(Xtr_m).any(axis=1)
mask_te = ~np.isnan(Xte_m).any(axis=1)
Xtr_drop, ytr_drop = Xtr_m[mask_tr], y_train[mask_tr]
Xte_drop, yte_drop = Xte_m[mask_te], y_test[mask_te]
Xtr_s, Xte_s = standardize_train_test(Xtr_drop, Xte_drop)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, ytr_drop)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Eliminar filas",
        Xtr_drop.shape[0],
        Xte_drop.shape[0],
        accuracy_score(yte_drop, yp),
        f1_score(yte_drop, yp, average="macro"),
        t,
    ]
)

In [47]:
# B) SimpleImputer (media)
imp = SimpleImputer(strategy="mean")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: media",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [48]:
# C) KNNImputer (k=5)
imp = KNNImputer(n_neighbors=5)
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: KNN (k=5)",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)


In [49]:
# OBLIGATORIO: Añade otro método de imputación básico
# D) SimpleImputer (mediana)
imp = SimpleImputer(strategy="median")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: mediana",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [50]:
# OBLIGATORIO: Añade otro método de imputación avanzado
# E) SimpleImputer (mas frecuente)
imp = SimpleImputer(strategy="most_frequent")
Xtr_imp = imp.fit_transform(Xtr_m)
Xte_imp = imp.transform(Xte_m)
Xtr_s, Xte_s = standardize_train_test(Xtr_imp, Xte_imp)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s, y_train)
t = time.perf_counter() - t0
yp = clf.predict(Xte_s)
res_imput.append(
    [
        "Imputación: más frecuente",
        Xtr_imp.shape[0],
        Xte_imp.shape[0],
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t,
    ]
)

In [51]:
# OPCIONAL: Usa un dataset de regresión, repite todos los métodos de imputación y muestra los resultados.
# No olvides adaptar todas las métricas: Accuracy/F1 solo sirven para problemas de clasificación

### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos y decidiendo cuál es el mejor método de imputación en cada caso

In [52]:
# Mostramos resultados
df_imput = pd.DataFrame(res_imput, columns=['Tratamiento', 'n_train', 'n_test', 'Accuracy', 'F1-macro', 'tiempo_entreno_s'])
df_imput

Unnamed: 0,Tratamiento,n_train,n_test,Accuracy,F1-macro,tiempo_entreno_s
0,Eliminar filas,90,33,0.939394,0.93797,0.002151
1,Imputación: media,426,143,0.965035,0.962378,0.002143
2,Imputación: KNN (k=5),426,143,0.958042,0.955031,0.001023
3,Imputación: mediana,426,143,0.958042,0.955031,0.001065
4,Imputación: más frecuente,426,143,0.937063,0.933292,0.001062


## 4) Selección tipo filtro
Comparamos *sin selección* vs **SelectKBest** con:
- `f_classif` (general)
- `chi2` (requiere no-negatividad, debemos aplicar `MinMaxScaler` antes de usarlo)

In [53]:
# Usamos los datos SIN NaN (X_train / X_test originales)
imp = SimpleImputer()  # por seguridad
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s0, Xte_s0 = standardize_train_test(Xtr, Xte)

# Baseline
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_s0, y_train)
t_base2 = time.perf_counter() - t0
yp = clf.predict(Xte_s0)
acc0 = accuracy_score(y_test, yp)
f10 = f1_score(y_test, yp, average="macro")

rows = [["Sin selección", Xtr.shape[1], acc0, f10, t_base2]]


In [54]:
# SelectKBest f_classif (k=10)
k = min(10, Xtr.shape[1])
sel = SelectKBest(score_func=f_classif, k=k)
Xtr_k = sel.fit_transform(Xtr_s0, y_train)
Xte_k = sel.transform(Xte_s0)
clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_k, y_train)
t1 = time.perf_counter() - t0
yp = clf.predict(Xte_k)
rows.append(
    [
        f"SelectKBest f_classif (k={k})",
        k,
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t1,
    ]
)


In [55]:
# SelectKBest chi2 (k=10) → MinMax
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
Xtr_mm = mm.fit_transform(Xtr)
Xte_mm = mm.transform(Xte)
sel = SelectKBest(score_func=chi2, k=k)
Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
Xte_k2 = sel.transform(Xte_mm)
clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
yp = clf.predict(Xte_k2)
rows.append([f'SelectKBest chi2 (k={k})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])



In [56]:
# OBLIGATORIO: Prueba con diferentes valores de n_features
from sklearn.preprocessing import MinMaxScaler
k_values = [10, 20, 30, 40]
for i in k_values:
	sel = SelectKBest(score_func=f_classif, k=i)
	Xtr_k = sel.fit_transform(Xtr_s0, y_train)
	Xte_k = sel.transform(Xte_s0)
	clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
	t0 = time.perf_counter()
	clf.fit(Xtr_k, y_train)
	t1 = time.perf_counter() - t0
	yp = clf.predict(Xte_k)
	rows.append(
    	[
        	f"SelectKBest f_classif (k={i})",
        	k,
        	accuracy_score(y_test, yp),
        	f1_score(y_test, yp, average="macro"),
        	t1,
    	]
	)
	mm = MinMaxScaler()
	Xtr_mm = mm.fit_transform(Xtr)
	Xte_mm = mm.transform(Xte)
	sel = SelectKBest(score_func=chi2, k=i)
	Xtr_k2 = sel.fit_transform(Xtr_mm, y_train)
	Xte_k2 = sel.transform(Xte_mm)
	clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
	t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train); t2 = time.perf_counter() - t0
	yp = clf.predict(Xte_k2)
	rows.append([f'SelectKBest chi2 (k={i})', k, accuracy_score(y_test, yp), f1_score(y_test, yp, average='macro'), t2])


In [57]:
pd.DataFrame(
    rows,
    columns=["Tratamiento", "n_features", "Accuracy", "F1-macro", "tiempo_entreno_s"],
)

Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_entreno_s
0,Sin selección,30,0.958042,0.955031,0.001146
1,SelectKBest f_classif (k=10),10,0.951049,0.94733,0.000852
2,SelectKBest chi2 (k=10),10,0.93007,0.924451,0.000507
3,SelectKBest f_classif (k=10),10,0.951049,0.94733,0.000781
4,SelectKBest chi2 (k=10),10,0.93007,0.924451,0.000368
5,SelectKBest f_classif (k=20),10,0.965035,0.962067,0.00056
6,SelectKBest chi2 (k=20),10,0.944056,0.939045,0.00046
7,SelectKBest f_classif (k=30),10,0.958042,0.955031,0.000806
8,SelectKBest chi2 (k=30),10,0.937063,0.931121,0.000655
9,SelectKBest f_classif (k=40),10,0.958042,0.955031,0.000882


In [58]:
# OBLIGATORIO: Usa un segundo problema de clasificación y repite todo
# Cargamos un problema de clasificación: Breast Cancer (binaria)
from sklearn.datasets import load_wine


data2 = load_wine()
X2, y2 = data2.data, data2.target
print('X shape:', X2.shape, '| y shape:', y2.shape)
rows=[]
# Partición train/test estratificada
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.25, stratify=y2, random_state=RANDOM_STATE
)
print('Train:', X_train2.shape, '| Test:', X_test2.shape)

imp2 = SimpleImputer()  # por seguridad
Xtr2 = imp.fit_transform(X_train2)
Xte2 = imp.transform(X_test2)
Xtr_s02, Xte_s02 = standardize_train_test(Xtr2, Xte2)

from sklearn.preprocessing import MinMaxScaler
k_values = [5, 10, 15, 25]
for j in k_values:
	sel = SelectKBest(score_func=f_classif, k=j)
	Xtr_k = sel.fit_transform(Xtr_s02, y_train2)
	Xte_k = sel.transform(Xte_s02)
	clf = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
	t0 = time.perf_counter()
	clf.fit(Xtr_k, y_train2)
	t1 = time.perf_counter() - t0
	yp = clf.predict(Xte_k)
	rows.append(
    	[
        	f"SelectKBest f_classif (k={j}) WINE",
        	j,
        	accuracy_score(y_test2, yp),
        	f1_score(y_test2, yp, average="macro"),
        	t1,
    	]
	)
	mm = MinMaxScaler()
	Xtr_mm = mm.fit_transform(Xtr2)
	Xte_mm = mm.transform(Xte2)
	sel = SelectKBest(score_func=chi2, k=j)
	Xtr_k2 = sel.fit_transform(Xtr_mm, y_train2)
	Xte_k2 = sel.transform(Xte_mm)
	clf = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
	t0 = time.perf_counter(); clf.fit(Xtr_k2, y_train2); t2 = time.perf_counter() - t0
	yp = clf.predict(Xte_k2)
	rows.append([f'SelectKBest chi2 (k={j}) WINE', j, accuracy_score(y_test2, yp), f1_score(y_test2, yp, average='macro'), t2])

X shape: (178, 13) | y shape: (178,)
Train: (133, 13) | Test: (45, 13)


In [59]:
# OPCIONAL: Usa un problema de regresión, adapta lo necesario y repite todo

In [60]:

pd.DataFrame(
    rows,
    columns=["Tratamiento", "n_features", "Accuracy", "F1-macro", "tiempo_entreno_s"],
)



Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_entreno_s
0,SelectKBest f_classif (k=5) WINE,5,0.955556,0.956654,0.000458
1,SelectKBest chi2 (k=5) WINE,5,0.977778,0.979497,0.000313
2,SelectKBest f_classif (k=10) WINE,10,1.0,1.0,0.00035
3,SelectKBest chi2 (k=10) WINE,10,0.977778,0.979497,0.000315
4,SelectKBest f_classif (k=15) WINE,15,1.0,1.0,0.000375
5,SelectKBest chi2 (k=15) WINE,15,1.0,1.0,0.000335
6,SelectKBest f_classif (k=25) WINE,25,1.0,1.0,0.000376
7,SelectKBest chi2 (k=25) WINE,25,1.0,1.0,0.000334


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos y decidiendo cuál es el mejor número de características en cada caso

## 5) RFECV
Usamos **RFECV** para encontrar automáticamente cuántas características dejar. Después reentrenamos una RL con esas características.

In [18]:
imp = SimpleImputer(); Xtr = imp.fit_transform(X_train); Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)

est = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
rfecv = RFECV(estimator=est, step=2, cv=5, scoring='f1_macro', n_jobs=-1)
t0 = time.perf_counter(); rfecv.fit(Xtr_s, y_train); t_sel = time.perf_counter() - t0
nsel = int(getattr(rfecv, 'n_features_', Xtr.shape[1]))

Xtr_sel = rfecv.transform(Xtr_s); Xte_sel = rfecv.transform(Xte_s)
final = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
t0 = time.perf_counter(); final.fit(Xtr_sel, y_train); t_fit = time.perf_counter() - t0
yp = final.predict(Xte_sel)

print('RFECV')
print('n_features seleccionadas:', nsel)
print(f'Accuracy: {accuracy_score(y_test, yp):.4f}  |  F1-macro: {f1_score(y_test, yp, average="macro"):.4f}  |  tiempo_total: {t_sel + t_fit:.3f}s')

RFECV
n_features seleccionadas: 6
Accuracy: 0.9441  |  F1-macro: 0.9400  |  tiempo_total: 1.093s


In [19]:
# OBLIGATORIO: Compara RFECV con filtro en las mismas condiciones (mismo dataset, misma imputación/escalado, mismo número de características) y explica cuál es mejor
imp = SimpleImputer(); Xtr = imp.fit_transform(X_train); Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)

est = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
filt = SelectKBest(score_func=f_regression, k=6)
t0 = time.perf_counter();filt.fit(Xtr_s, y_train); t_sel = time.perf_counter() - t0
nsel = 0
features = filt.get_support()
for i in features:
	if i:
		nsel += 1
print('SelectKBest con f_regression')
print('n_features: ', nsel)

SelectKBest con f_regression
n_features:  6


In [20]:
# OBLIGATORIO: Eligen RFECV y filtro las mismas variables?
maks_rfe = rfecv.get_support()
mask_filter = filt.get_support()

rfe = rfecv.get_feature_names_out()
fil = filt.get_feature_names_out()

for i in range(6):
	if rfe[i] != fil[i]:
		print("No escogen las mismas columnas")
		print(rfe, fil)
		break

No escogen las mismas columnas
['x7' 'x20' 'x21' 'x22' 'x23' 'x27'] ['x0' 'x2' 'x7' 'x20' 'x22' 'x27']


In [21]:
# OPCIONAL: Repite todo para un problema de regresión

### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## 6) SelectFromModel
Primero seleccionamos características y luego reentrenamos para comparar solo el efecto de la selección.

In [None]:
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
Xtr_s, Xte_s = standardize_train_test(Xtr, Xte)


In [None]:
rows = []
# L1 como selector
sel1 = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear", random_state=RANDOM_STATE)
)
t0 = time.perf_counter()
sel1.fit(Xtr_s, y_train)
t_sel1 = time.perf_counter() - t0
Xtr_sel = sel1.transform(Xtr_s)
Xte_sel = sel1.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit1 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(L1 LR) + LR L2",
        int(sel1.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel1 + t_fit1,
    ]
)


In [None]:
# RandomForest como selector
sel2 = SelectFromModel(
    RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
)
t0 = time.perf_counter()
sel2.fit(Xtr_s, y_train)
t_sel2 = time.perf_counter() - t0
Xtr_sel = sel2.transform(Xtr_s)
Xte_sel = sel2.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit2 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(RandomForest) + LR L2",
        int(sel2.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel2 + t_fit2,
    ]
)


In [None]:
# OBLIGATORIO: Compara los tres métodos de selección de características en las mismas condiciones 
# (usando el mismo dataset, misma imputación/escalado, mismo número de características) y explica cuál es mejor
from sklearn.ensemble import AdaBoostClassifier


sel3 = SelectFromModel(
	AdaBoostClassifier(n_estimators=200, random_state=RANDOM_STATE)
)
t0 = time.perf_counter()
sel3.fit(Xtr_s, y_train)
t_sel3 = time.perf_counter() - t0
Xtr_sel = sel3.transform(Xtr_s)
Xte_sel = sel3.transform(Xte_s)
clf = LogisticRegression(penalty="l2", solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf.fit(Xtr_sel, y_train)
t_fit3 = time.perf_counter() - t0
yp = clf.predict(Xte_sel)
rows.append(
    [
        "SFM(AdaBoost) + LR L2",
        int(sel3.get_support().sum()),
        accuracy_score(y_test, yp),
        f1_score(y_test, yp, average="macro"),
        t_sel3 + t_fit3,
    ]
)
pd.DataFrame(rows, columns=['Tratamiento', 'n_features', 'Accuracy', 'F1-macro', 'tiempo_total_s'])

Unnamed: 0,Tratamiento,n_features,Accuracy,F1-macro,tiempo_total_s
0,SFM(L1 LR) + LR L2,14,0.958042,0.955031,0.00419
1,SFM(RandomForest) + LR L2,9,0.951049,0.94733,0.093798
2,SFM(AdaBoost) + LR L2,13,0.958042,0.95467,0.206191


In [None]:
# OBLIGATORIO: Elige este método las mismas variables que los anteriores? 
sel_1 = sel1.get_feature_names_out()
sel_2 = sel2.get_feature_names_out()
sel_3 = sel3.get_feature_names_out()

for i in range(6):
	if sel_1[i] != sel_2[i] and sel_1[i] != sel_3[i] and sel_2[i] != sel_3[i]:
		print("No escogen las mismas columnas")
		print(sel_1)
		print(sel_2)
		print(sel_3)
		break

No escogen las mismas columnas
['x6' 'x7' 'x9' 'x10' 'x14' 'x15' 'x18' 'x20' 'x21' 'x23' 'x24' 'x26'
 'x27' 'x28']
['x2' 'x3' 'x6' 'x7' 'x20' 'x22' 'x23' 'x26' 'x27']
['x7' 'x8' 'x10' 'x12' 'x15' 'x19' 'x21' 'x22' 'x23' 'x24' 'x26' 'x27'
 'x29']


In [None]:
# OPCIONAL: Repite todo para un problema de regresión


### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## 7) Selección de instancias
Reducimos deliberadamente el tamaño del conjunto de entrenamiento y comparamos con entrenar con todo el train.



- **CNN** (Condensed Nearest Neighbour): condensa el train manteniendo representantes.
- **ENN** (Edited Nearest Neighbours): elimina ejemplos conflictivos.

> Requiere imbalanced-learn: pip install imbalanced-learn.

In [None]:
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours

# Preprocesado (imputación + escalado con train)
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

rows = []


In [None]:
# A) Todo el train
clf_full = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_full.fit(Xtr_s, y_train)
t_full = time.perf_counter() - t0
yp_full = clf_full.predict(Xte_s)
acc_full = accuracy_score(y_test, yp_full)
f1_full = f1_score(y_test, yp_full, average="macro")
rows.append(["Todo el train", Xtr_s.shape[0], acc_full, f1_full, t_full])


In [None]:
# B) CNN (condensado)
cnn = CondensedNearestNeighbour(random_state=RANDOM_STATE)
Xtr_cnn, ytr_cnn = cnn.fit_resample(Xtr_s, y_train)
clf_cnn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_cnn.fit(Xtr_cnn, ytr_cnn)
t_cnn = time.perf_counter() - t0
yp_cnn = clf_cnn.predict(Xte_s)
rows.append(
    [
        "CNN (condensado)",
        Xtr_cnn.shape[0],
        accuracy_score(y_test, yp_cnn),
        f1_score(y_test, yp_cnn, average="macro"),
        t_cnn,
    ]
)

In [None]:
# C) ENN (edición)
enn = EditedNearestNeighbours()
Xtr_enn, ytr_enn = enn.fit_resample(Xtr_s, y_train)
clf_enn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_enn.fit(Xtr_enn, ytr_enn)
t_enn = time.perf_counter() - t0
yp_enn = clf_enn.predict(Xte_s)
rows.append(
    [
        "ENN (edición)",
        Xtr_enn.shape[0],
        accuracy_score(y_test, yp_enn),
        f1_score(y_test, yp_enn, average="macro"),
        t_enn,
    ]
)


In [None]:

pd.DataFrame(
    rows,
    columns=[
        "Tratamiento",
        "n_instancias_train",
        "Accuracy",
        "F1-macro",
        "tiempo_entreno_s",
    ],
)


Unnamed: 0,Tratamiento,n_instancias_train,Accuracy,F1-macro,tiempo_entreno_s
0,Todo el train,426,0.958042,0.955031,0.003766
1,CNN (condensado),201,0.951049,0.948116,0.000794
2,ENN (edición),408,0.951049,0.948116,0.002671


In [None]:
# OBLIGATORIO: Usa un nuevo dataset de clasificación y repítelo todo
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours
from sklearn.datasets import fetch_covtype

data = fetch_covtype()
X, y = data.data, data.target
print('X shape:', X.shape, '| y shape:', y.shape)

# Partición train/test estratificada
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print('Train:', X_train.shape, '| Test:', X_test.shape)

# Preprocesado (imputación + escalado con train)
imp = SimpleImputer()
Xtr = imp.fit_transform(X_train)
Xte = imp.transform(X_test)
sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

rows = []

# A) Todo el train
clf_full = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_full.fit(Xtr_s, y_train)
t_full = time.perf_counter() - t0
yp_full = clf_full.predict(Xte_s)
acc_full = accuracy_score(y_test, yp_full)
f1_full = f1_score(y_test, yp_full, average="macro")
rows.append(["Todo el train", Xtr_s.shape[0], acc_full, f1_full, t_full])
# B) CNN (condensado)
cnn = CondensedNearestNeighbour(random_state=RANDOM_STATE)
Xtr_cnn, ytr_cnn = cnn.fit_resample(Xtr_s, y_train)
clf_cnn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_cnn.fit(Xtr_cnn, ytr_cnn)
t_cnn = time.perf_counter() - t0
yp_cnn = clf_cnn.predict(Xte_s)
rows.append(
    [
        "CNN (condensado)",
        Xtr_cnn.shape[0],
        accuracy_score(y_test, yp_cnn),
        f1_score(y_test, yp_cnn, average="macro"),
        t_cnn,
    ]
)

# C) ENN (edición)
enn = EditedNearestNeighbours()
Xtr_enn, ytr_enn = enn.fit_resample(Xtr_s, y_train)
clf_enn = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)
t0 = time.perf_counter()
clf_enn.fit(Xtr_enn, ytr_enn)
t_enn = time.perf_counter() - t0
yp_enn = clf_enn.predict(Xte_s)
rows.append(
    [
        "ENN (edición)",
        Xtr_enn.shape[0],
        accuracy_score(y_test, yp_enn),
        f1_score(y_test, yp_enn, average="macro"),
        t_enn,
    ]
)


X shape: (581012, 54) | y shape: (581012,)
Train: (435759, 54) | Test: (145253, 54)


In [None]:
pd.DataFrame(
    rows,
    columns=[
        "Tratamiento",
        "n_instancias_train",
        "Accuracy",
        "F1-macro",
        "tiempo_entreno_s",
    ],
)

Unnamed: 0,Tratamiento,n_instancias_train,Accuracy,F1-macro,tiempo_entreno_s
0,Todo el train,435759,0.715249,0.47883,98.22743
1,CNN (condensado),4274,0.207631,0.187563,0.203019
2,ENN (edición),364485,0.71573,0.487959,25.255179


In [None]:
# OPCIONAL: Usa un dataset de regresión

### ENTREGABLE: Escribe en el documento de la práctica (formato libre) un texto explicando los resultados obtenidos

## Demostración de Pipeline (sin entregables)
Esto no es necesario para entender los métodos; simplemente muestra cómo encadenar pasos.

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ("imp", SimpleImputer()),
        ("sc", StandardScaler()),
        (
            "sel",
            SelectFromModel(
                LogisticRegression(
                    penalty="l1", solver="liblinear", random_state=RANDOM_STATE
                )
            ),
        ),
        (
            "clf",
            LogisticRegression(
                penalty="l2", solver="liblinear", random_state=RANDOM_STATE
            ),
        ),
    ]
)
t0 = time.perf_counter()
pipe.fit(X_train, y_train)
t = time.perf_counter() - t0
yp = pipe.predict(X_test)
print(
    f"Pipeline → Accuracy: {accuracy_score(y_test, yp):.4f} | F1-macro: {f1_score(y_test, yp, average='macro'):.4f} | tiempo: {t:.3f}s"
)
