# Laboratorio 2 - Clasificador de rostros

**Autores**

*   [214205] Enrique, Oliva
*   [192680] Martina, Severo
*   [229484] Santiago, Tonarelli

**Formato de entrega**:

* Esta misma notebook en formato .ipynb
* Cambiar el nombre de la notebook por NumEst1_NumEst2_NumEst3_Lab_1.
* Es importante que la notebook pueda ejecutarse sin problemas al seleccionar 'Ejecutar todo'.
* Se considerará que sus datos pueden estar en otra localización.


**Plazo de entrega**: hasta el Domingo 16/06 a las 23:59 horas a través de Aulas.

**Objetivo**: implementar un algoritmo de clasificación que permita predecir si una imagen dada es un rostro o no.

## Librerías

In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.1
[notice] To update, run: c:\Users\Santiago\.pyenv\pyenv-win\versions\3.12.1\python.exe -m pip install --upgrade pip


In [2]:
import os
from tqdm import tqdm
from time import time

import random
import numpy as np
import matplotlib.pyplot as plt

from skimage.exposure import equalize_hist

from skimage.transform import integral_image
from skimage.feature import haar_like_feature, haar_like_feature_coord

from sklearn.feature_selection import SelectPercentile, f_classif

import zipfile
import os

from random import sample
from skimage.transform import resize
from skimage import data, color
from sklearn.feature_extraction.image import PatchExtractor
from sklearn.metrics import confusion_matrix, f1_score, classification_report

from sklearn.model_selection import train_test_split

## Funciones auxiliares

In [3]:
def extract_feature_image(img, feature_type=None, feature_coord=None):
    """Extrae las Haar features de la imagen"""
    ii = integral_image(img)
    return haar_like_feature(ii, 0, 0, ii.shape[0], ii.shape[1],
                             feature_type=feature_type,
                             feature_coord=feature_coord)


In [4]:
# Función para imprimir resultados de la F1 Score
def print_f1_results(accuracies, text=""):
    print("----------------------------------")
    print(text)
    print("----------------------------------")

    for i, scores in enumerate(accuracies, 1):
        print(f"Fold {i}:")
        print(f"Clase 0: {scores[0]}")
        print(f"Clase 1: {scores[1]}")
        print("")

    average_scores = np.mean(accuracies, axis=0)
    print(f"Promedio F1 por Clase:")
    print(f"Clase 0: {average_scores[0]}")
    print(f"Clase 1: {average_scores[1]}")


In [None]:
# Función para extraer porciones de una imagen
def extract_patches(img, N, scale=1.0, patch_size=(19,19), random_state=0):
    # Calcula el tamaño del parche extraído basado en el factor de escala dado
    H = img.shape[0]
    W = img.shape[1]
    H_patch = min(H , int(scale * patch_size[0]))
    W_patch = min(W , int(scale * patch_size[1]))
    extracted_patch_size = (H_patch, W_patch)

    # Inicializa un objeto PatchExtractor con el tamaño de parche calculado,
    # el número máximo de parches, y una semilla de estado aleatorio
    extractor = PatchExtractor(patch_size=extracted_patch_size, max_patches=N, random_state=random_state)

    # Extrae parches de la imagen dada
    # img[np.newaxis] se utiliza la entrada de PatchExtractor es un conjunto de imágenes
    patches = extractor.transform(img[np.newaxis])

    # Si el factor de escala no es 1, redimensiona cada parche extraído
    # al tamaño del parche original
    if scale != 1:
        patches = np.array([resize(patch, patch_size) for patch in patches])

    # Devuelve la lista de parches extraídos (y posiblemente redimensionados)
    return patches


In [None]:
# True Positive Rate
def tpr_scorer(clf, X, y):
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    tpr = cm[1,1]/(cm[1,1]+cm[1,0])
    return tpr

# False Positive Rate
def fpr_scorer(clf, X, y):
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    fpr = cm[0,1]/(cm[0,0]+cm[0,1])
    return fpr


## Datos

**Imágenes de rostros para entrenamiento:**
* formato .pgm
* 64 x 64 píxeles
* 0 - 255 rango de valores
* N = 12833 imágenes


**Imágenes de test (rostros y fondos sin etiquetar):**
* formato .pgm
* 19 x 19 píxeles
* 0 - 255 rango de valores
* N = 7920 imágenes 

In [5]:
# # colab
# !unzip /content/CBCL.zip
# !tar -xvzf /content/face.test.tar.gz
# !tar -xvzf /content/face.train.tar.gz

# # vscode
dataset_file = 'content/obligatorio-mlia-2024.zip'
background_file = 'content/background.zip'

dataset_extract_path = 'content/obligatorio-mlia-2024'
background_extract_path = 'content/background'

os.makedirs(background_extract_path, exist_ok=True)
os.makedirs(dataset_extract_path, exist_ok=True)

with zipfile.ZipFile(dataset_file, 'r') as zip_ref:
    zip_ref.extractall(dataset_extract_path)

with zipfile.ZipFile(background_file, 'r') as zip_ref:
    zip_ref.extractall(background_extract_path)


In [5]:
suffix = '.pgm'

faces = os.listdir('{dataset_extract_path}/Faces/Faces')
faces = [filename for filename in faces if filename.endswith(suffix)]

print(f'# Faces: {len(faces)}')


In [None]:
# Tamaño de las imágenes de rostros
size = (19,19)

n_faces = 1000
sample_faces = sample(faces, n_faces)

im_faces = []
for filename in tqdm(sample_faces):
    path = '{dataset_extract_path}/Faces/Faces/' + filename
    with open(path, 'rb') as pgmf:
        image = plt.imread(pgmf)
        image = resize(image, size)
    im_faces.append(image)


In [None]:
# Visualización de las imágenes de rostros
K = 16
indices = sample(range(n_faces),k=K)

fig, ax = plt.subplots(4, 4, figsize=(3, 3), subplot_kw=dict(xticks=[], yticks=[]))
axes = ax.ravel()

for i in range(K):
    idx = indices[i]
    image = im_faces[idx]
    axes[i].imshow(image, cmap='gray')
    axes[i].axis('off')
    
plt.tight_layout()
plt.show()

In [None]:
# Tomamos algunas imágenes de sklearn
imgs_skimage = ['text',
                'coins',
                'moon',
                'page',
                'clock',
                'immunohistochemistry',
                'chelsea',
                'coffee',
                'hubble_deep_field'
                ]

backgrounds_big = []
for name in imgs_skimage:
    image = getattr(data, name)()
    if len(image.shape) == 3 and image.shape[2] == 3:  # Chequeamos si la imagen es RGB
        image = color.rgb2gray(image)
    backgrounds_big.append(image)

# Imagenes de fondo adicionales
for i in range(31):
    filename = {background_extract_path} + str(i) + '.jpg'
    image = plt.imread(filename)
    image = color.rgb2gray(image)
    backgrounds_big.append(image)

In [None]:
# Visualización de las imágenes de fondos
fig, ax = plt.subplots(4, 10, figsize=(20, 5), subplot_kw=dict(xticks=[], yticks=[]))
axes = ax.ravel()

for i in range(len(backgrounds_big)):
    image = backgrounds_big[i]
    axes[i].imshow(image, cmap='gray')
    axes[i].axis('off')

plt.tight_layout()

In [None]:

# Extraer parches de las imágenes de fondo
scales = [1,1.5,2,2.5]
proportion = 2
num_patches = int((proportion * n_faces)/(len(scales) * len(backgrounds_big)))

im_backgrounds = np.vstack(
    [extract_patches(im_bkgnd, num_patches, scale, random_state=42)
    for im_bkgnd in tqdm(backgrounds_big)
    for scale in scales]
    )

print(f'# Back: {len(im_backgrounds)}')

In [None]:
# Visualización de una muestra
fig, ax = plt.subplots(3, 5, figsize=(5,3))
for i, axi in enumerate(ax.flat):
    axi.imshow(im_backgrounds[100 * i], cmap='gray')
    axi.axis('off')
plt.tight_layout()
plt.show()

In [None]:
im_faces = np.array(im_faces)

print(f'Faces shape: {im_faces.shape}')
print(f'Backs shape: {im_backgrounds.shape}')

In [None]:
Im_train = np.vstack([im_faces, im_backgrounds])
print(f'Images shape: {Im_train.shape}')

In [None]:
y_train = np.array([1]*len(im_faces)+[0]*len(im_backgrounds))
print(f'y_train shape: {y_train.shape}')

## Logica anterior (TODO: borrar si no se usa)

In [7]:
# # Tomaremos una fracción de los datos. Puede ajustar estos parámetros a gusto
# f = 0.2
# n_face = int(f*len(train_faces))
# n_back = int(f*len(train_background))

# # Para mantener la proporción de background en test calculamos (para mantener una proporción balanceada entre las clases (rostros y no-rostros)):
# m = int(np.round(len(test_faces)*len(train_background)/len(train_faces)))

# print(f'# Train Faces Sample Size: {n_face}')
# print(f'# Train Back Sample Size: {n_back}')
# print(f'# m: {m}')

# Train Faces Sample Size: 485
# Train Back Sample Size: 909
# m: 884


In [8]:
# sample_train_faces = random.sample(train_faces,n_face)

# Im_train = []
# for filename in tqdm(sample_train_faces):
#     # path = '/content/train/face/' + filename # colab
#     path = 'train/face/' + filename # vscode
#     with open(path, 'rb') as pgmf:
#         image = plt.imread(pgmf)
#     Im_train.append(image)

# n_train_faces = len(Im_train)
# y_train = [1]*n_train_faces # Cada imagen de rostro se etiqueta con un 1

100%|██████████| 485/485 [00:01<00:00, 366.06it/s]


In [9]:
# sample_train_background = random.sample(train_background,n_back)

# for filename in tqdm(sample_train_background):
#     # path = "/content/train/non-face/" + filename # colab
#     path = "train/non-face/" + filename # vscode
#     with open(path, 'rb') as pgmf:
#         image = plt.imread(pgmf)
#     Im_train.append(image)

# n_train_background = len(Im_train)-n_train_faces
# y_train = y_train + [0]*n_train_background # Cada imagen de no-rostro se etiqueta con un 0

100%|██████████| 909/909 [00:02<00:00, 364.07it/s]


In [10]:
# print(f'# Train: {len(Im_train)}, {len(y_train)}')

# Train: 1394, 1394


In [11]:
# Im_test = []
# for filename in tqdm(test_faces):
#     # path = "/content/test/face/" + filename # colab
#     path = "test/face/" + filename # vscode
#     with open(path, 'rb') as pgmf:
#         image = plt.imread(pgmf)
#     Im_test.append(image)

# n_test_faces = len(Im_test)
# y_test = [1]*n_test_faces

100%|██████████| 472/472 [00:01<00:00, 354.99it/s]


In [12]:
# sample_test_background = random.sample(test_background,m)

# for filename in tqdm(sample_test_background):
#     # path = "/content/test/non-face/" + filename # colab
#     path = "test/non-face/" + filename # vscode
#     with open(path, 'rb') as pgmf:
#         image = plt.imread(pgmf)
#     Im_test.append(image)

# n_test_background = len(Im_test)-n_test_faces
# y_test = y_test + [0]*n_test_background

100%|██████████| 884/884 [00:02<00:00, 374.47it/s]


In [13]:
# print(f'# Test: {len(Im_test)}, {len(y_test)}')

# Test: 1356, 1356


## Histogram equalization

In [None]:
Im_train, Im_test, y_train, y_test = train_test_split(Im, 
                                                      y, 
                                                      test_size=0.5, 
                                                      shuffle=True, 
                                                      stratify=y, 
                                                      random_state = 42)


In [None]:
print(f'Im_train shape: {Im_train.shape}')
print(f'Im_test shape: {Im_test.shape}')
print(f'y_train shape: {y_test.shape}')
print(f'y_test shape: {y_test.shape}')

In [14]:
# Normalización de las imágenes de entrenamiento y prueba
Im_train_norm = [equalize_hist(image) for image in Im_train]
Im_test_norm = [equalize_hist(image) for image in Im_test]

## Matriz de features

### Calculamos y seleccionamos las mejores features en entrenamiento

In [15]:
X_train = [extract_feature_image(img) for img in tqdm(Im_train_norm)]
X_train = np.array(X_train)

100%|██████████| 1394/1394 [01:25<00:00, 16.25it/s]


In [16]:
# Pueden guardar la matriz si lo desean
np.save('X_train', X_train)

In [17]:
# Y cargarla posteriormente
X_train = np.load('X_train.npy')

In [18]:
X_train.shape

(1394, 63960)

In [19]:
# Selección de características en train
print("Seleccionando las features de mayor dependencia lineal con y")
t_start = time()
# SelectPercentile: selecciona las mejores características basadas en una prueba estadística.
# f_classif: mide la dependencia lineal entre dos conjuntos de datos.
# percentile=1: Selecciona el 1% de las mejores características.
# fit(X_train, y_train): Ajusta el selector de características a los datos de entrenamiento X_train y y_train.
# get_support(indices=True): Obtiene los índices de las características seleccionadas.
f_indices = SelectPercentile(f_classif, percentile=1).fit(X_train, y_train).get_support(indices=True)
t = time() - t_start
X_train = X_train[:,f_indices]
print("Seleccionadas %d features potenciales" % X_train.shape[1])
print(f'Tiempo: {t} segundos')

Seleccionando las features de mayor dependencia lineal con y
Seleccionadas 640 features potenciales
Tiempo: 0.7966973781585693 segundos


In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import AdaBoostClassifier

def select_features_with_adaboost(X, y):
    """
    Selecciona las características más relevantes utilizando AdaBoostClassifier.

    Parámetros:
    X: numpy array
        Conjunto de datos de entrenamiento.
    y: numpy array
        Etiquetas del conjunto de datos de entrenamiento.

    Retorna:
    X_selected: numpy array
        Conjunto de datos de entrenamiento con las características seleccionadas.
    model: AdaBoostClassifier
        El modelo ajustado de AdaBoostClassifier.
    tiempo: float
        Tiempo de ejecución del proceso de selección.
    """
    print("Seleccionando las características más importantes con AdaBoostClassifier")
    t_start = time()
    
    # Inicializar el clasificador AdaBoost
    model = AdaBoostClassifier(n_estimators=50, random_state=42)
    model.fit(X, y)
    
    # Seleccionar características basadas en su importancia
    selector = SelectFromModel(model, prefit=True)
    X_selected = selector.transform(X)
    
    tiempo = time() - t_start
    
    print("Seleccionadas %d características potenciales" % X_selected.shape[1])
    print(f'Tiempo: {tiempo:.2f} segundos')
    
    return X_selected, model, tiempo

# Selección de características en el conjunto de entrenamiento con AdaBoost
X_train, model, t = select_features_with_adaboost(X_train, y_train)

# Aplicar la selección de características al conjunto de prueba
X_test = X_test[:, model.feature_importances_ > 0]

### Calculamos dichas features para test

In [20]:
# haar_like_feature_coord(): genera coordenadas y tipos de características Haar para una ventana de búsqueda especificada. En este caso 19x19 píxeles.
feature_coord, feature_type = haar_like_feature_coord(width=19,
                                                      height=19,
                                                      )

In [21]:
t_start = time()
X_test = [extract_feature_image(img,
                                feature_type=feature_type[f_indices],
                                feature_coord=feature_coord[f_indices]) for img in tqdm(Im_test_norm)]
t = time() - t_start
X_test = np.array(X_test)

100%|██████████| 1356/1356 [00:00<00:00, 3637.41it/s]


In [22]:
print(f'Tiempo: {t} segundos')
print(f'Shape X_test: {X_test.shape}')

Tiempo: 0.37479424476623535 segundos
Shape X_test: (1356, 640)


# Clasificadores - Evaluación con Holdout

### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Crear el clasificador Random Forest
rf_classifier = RandomForestClassifier(n_estimators=600, random_state=42)

# Entrenar el clasificador con el conjunto de entrenamiento y las características extraídas
rf_classifier.fit(X_train, y_train)

# Evaluación del Modelo
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, confusion_matrix

# Predecir las etiquetas de las imágenes de prueba
y_pred = rf_classifier.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred)
f1_score_rf = f1_score(y_test, y_pred)
precision_rf = precision_score(y_test, y_pred)

print("Accuracy:", accuracy_rf)
print("F1 Score:", f1_score_rf)
print("Precision:", precision_rf)

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.7234513274336283
F1 Score: 0.40381558028616854
Precision: 0.8089171974522293

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.71      0.97      0.82       884
           1       0.81      0.27      0.40       472

    accuracy                           0.72      1356
   macro avg       0.76      0.62      0.61      1356
weighted avg       0.75      0.72      0.68      1356


Matriz de confusión:
[[854  30]
 [345 127]]


### Gradient Boosting

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

gb_classifier.fit(X_train, y_train)

# Evaluación del modelo
y_pred_gb = gb_classifier.predict(X_test)

accuracy_gb = accuracy_score(y_test, y_pred_gb)
f1_score_gb = f1_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb)

print("Accuracy:", accuracy_gb)
print("F1 Score:", f1_score_gb)
print("Precision:", precision_gb)

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_gb))

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_gb))


Accuracy: 0.724188790560472
F1 Score: 0.415625
Precision: 0.7916666666666666

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.71      0.96      0.82       884
           1       0.79      0.28      0.42       472

    accuracy                           0.72      1356
   macro avg       0.75      0.62      0.62      1356
weighted avg       0.74      0.72      0.68      1356


Matriz de confusión:
[[849  35]
 [339 133]]


### Árboles de decisión

In [25]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(random_state=42)

dt_classifier.fit(X_train, y_train)

# Evaluación del modelo
y_pred_dt = dt_classifier.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
f1_score_dt = f1_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)

print("Accuracy:", accuracy_dt)
print("F1 Score:", f1_score_dt)
print("Precision:", precision_dt)

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_dt))

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_dt))


Accuracy: 0.7359882005899705
F1 Score: 0.5149051490514905
Precision: 0.7142857142857143

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.74      0.91      0.82       884
           1       0.71      0.40      0.51       472

    accuracy                           0.74      1356
   macro avg       0.73      0.66      0.67      1356
weighted avg       0.73      0.74      0.71      1356


Matriz de confusión:
[[808  76]
 [282 190]]


### Regresión logística

In [26]:
# Probar con SGClassifier
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

lr_classifier.fit(X_train, y_train)

# Evaluación del modelo
y_pred_lr = lr_classifier.predict(X_test)

accuracy_lr = accuracy_score(y_test, y_pred_lr)
f1_score_lr = f1_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)

print("Accuracy:", accuracy_lr)
print("F1 Score:", f1_score_lr)
print("Precision:", precision_lr)

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_lr))

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_lr))


Accuracy: 0.724188790560472
F1 Score: 0.42813455657492355
Precision: 0.7692307692307693

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.72      0.95      0.82       884
           1       0.77      0.30      0.43       472

    accuracy                           0.72      1356
   macro avg       0.74      0.62      0.62      1356
weighted avg       0.74      0.72      0.68      1356


Matriz de confusión:
[[842  42]
 [332 140]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Redes neuronales

In [27]:
from sklearn.neural_network import MLPClassifier

mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

mlp_classifier.fit(X_train, y_train)

# Evaluación del modelo
y_pred_mlp = mlp_classifier.predict(X_test)

accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
f1_score_mlp = f1_score(y_test, y_pred_mlp)
precision_mlp = precision_score(y_test, y_pred_mlp)

print("Accuracy:", accuracy_mlp)
print("F1 Score:", f1_score_mlp)
print("Precision:", precision_mlp)

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_mlp))

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_mlp))


Accuracy: 0.721976401179941
F1 Score: 0.3642495784148398
Precision: 0.8925619834710744

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.71      0.99      0.82       884
           1       0.89      0.23      0.36       472

    accuracy                           0.72      1356
   macro avg       0.80      0.61      0.59      1356
weighted avg       0.77      0.72      0.66      1356


Matriz de confusión:
[[871  13]
 [364 108]]


# Clasificadores - Evaluación con Repeated Holdout

In [28]:
# Número de repeticiones
n_repeats = 5
accuracies_rf = []
accuracies_gb = []
accuracies_dt = []
accuracies_lr = []
accuracies_mlp = []

for _ in range(n_repeats):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

    # Entrenar y evaluar Random Forest
    rf_classifier.fit(X_train_split, y_train_split)
    y_pred_rf = rf_classifier.predict(X_val_split)
    accuracies_rf.append(f1_score(y_val_split, y_pred_rf, average=None))

    # Entrenar y evaluar Gradient Boosting
    gb_classifier.fit(X_train_split, y_train_split)
    y_pred_gb = gb_classifier.predict(X_val_split)
    accuracies_gb.append(f1_score(y_val_split, y_pred_gb, average=None))

    # Entrenar y evaluar Árboles de decisión
    dt_classifier.fit(X_train_split, y_train_split)
    y_pred_dt = dt_classifier.predict(X_val_split)
    accuracies_dt.append(f1_score(y_val_split, y_pred_dt, average=None))

    # Entrenar y evaluar Regresión logística
    lr_classifier.fit(X_train_split, y_train_split)
    y_pred_lr = lr_classifier.predict(X_val_split)
    accuracies_lr.append(f1_score(y_val_split, y_pred_lr, average=None))

    # Entrenar y evaluar Redes neuronales
    mlp_classifier.fit(X_train_split, y_train_split)
    y_pred_mlp = mlp_classifier.predict(X_val_split)
    accuracies_mlp.append(f1_score(y_val_split, y_pred_mlp, average=None))

print_f1_results(accuracies_rf, "Repeated Holdout F1 Scores (RF)")
print_f1_results(accuracies_gb, "Repeated Holdout F1 Scores (GB)")
print_f1_results(accuracies_dt, "Repeated Holdout F1 Scores (DT)")
print_f1_results(accuracies_lr, "Repeated Holdout F1 Scores (LR)")
print_f1_results(accuracies_mlp, "Repeated Holdout F1 Scores (MLP)")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Repeated Holdout Accuracy Scores (RF): 0.9713261648745519
Repeated Holdout Accuracy Scores (GB): 0.96415770609319
Repeated Holdout Accuracy Scores (DT): 0.931899641577061
Repeated Holdout Accuracy Scores (LR): 0.9713261648745519
Repeated Holdout Accuracy Scores (MLP): 0.921146953405018


# Selección de modelos (búsqueda de hiperparámetros)

### Random Forest

In [29]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=5,  # Este es el parámetro para cross-validation
                                   scoring='f1',
                                   verbose=1,
                                   n_jobs=-1)

random_search_rf.fit(X_train, y_train)
cv_results_rf = random_search_rf.cv_results_
best_rf_model = random_search_rf.best_estimator_

# TODO: delete if made in next cell
# print("Mejor modelo:", best_rf_model)
# print("cv_results:", cv_results_rf)
# print("Mejores hiperparámetros:", random_search_rf.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [30]:
print("Mejor modelo:", best_rf_model)
print("cv_results:", cv_results_rf)
print("Mejores hiperparámetros:", random_search_rf.best_params_)

Mejor modelo: RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_leaf=2,
                       min_samples_split=3, n_estimators=206, random_state=42)
cv_results: {'mean_fit_time': array([2.16182389, 1.85785007, 4.44260931, 2.59207191, 3.37391658,
       2.55909591, 4.60522957, 2.26704545, 3.38470721, 3.11458688,
       4.70457268, 1.25481753, 1.46127753, 2.13563662, 6.57258515,
       5.35194545, 1.64882751, 3.9408658 , 2.7105514 , 2.26810169,
       2.57775822, 1.44436784, 0.59273906, 2.94273896, 6.94945016,
       3.2127264 , 4.16398811, 2.19939904, 2.92939839, 2.55268607,
       1.45219307, 7.22535534, 4.87708406, 7.19496551, 2.50363679,
       1.5448977 , 3.20092578, 0.62766371, 2.55726981, 4.88097577,
       1.77118607, 3.37309175, 2.47965097, 3.37638407, 3.04719949,
       1.46218028, 3.10493798, 2.91989765, 2.17878237, 2.20206771,
       3.09310904, 1.58659773, 1.31297703, 1.93992071, 2.86247568,
       3.0346076 , 7.71981754, 2.98883948, 3.24726195, 8.44846931,

### Gradient Boosting

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import GradientBoostingClassifier

param_dist_gb = {
    'n_estimators': randint(100, 500),
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0]
}

random_search_gb = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                                      param_distributions=param_dist_gb,
                                      n_iter=20,
                                      cv=3,
                                      scoring='f1',
                                      n_jobs=-1,
                                      verbose=1)

random_search_gb.fit(X_train, y_train)
cv_results_gb = random_search_gb.cv_results_
best_gb_model = random_search_gb.best_estimator_

# TODO: delete if made in next cell
# print("Mejor modelo:", best_gb_model)
# print("cv_results:", cv_results_gb)
# print("Mejores hiperparámetros:", random_search_gb.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [33]:
%pip install pandas

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/22/a5/a0b255295406ed54269814bc93723cfd1a0da63fb9aaf99e1364f07923e5/pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata
  Downloading pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/9c/3d/a121f284241f08268b21359bd425f7d4825cffc5ac5cd0e1b3d82ffd2b10/pytz-2024.1-py2.py3-none-any.whl.metadata
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Obtaining dependency information for tzdata>=2022.7 from https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl.metadata
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-win_amd64.whl (11.5 MB)
   -----------------


[notice] A new release of pip is available: 23.2.1 -> 24.1
[notice] To update, run: c:\Users\Santiago\.pyenv\pyenv-win\versions\3.12.1\python.exe -m pip install --upgrade pip


In [34]:
print("Mejor modelo:", best_gb_model)

import pandas as pd

# Convertir cv_results_ a un DataFrame
results_df = pd.DataFrame(cv_results_gb)

# Mostrar las primeras filas del DataFrame para ver los resultados
print("cv_results:", results_df.head())

# print("cv_results:", cv_results_gb)
print("Mejores hiperparámetros:", random_search_gb.best_params_)

Mejor modelo: GradientBoostingClassifier(learning_rate=0.2, n_estimators=495, random_state=42,
                           subsample=0.8)
cv_results:    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      78.240763     10.938288         0.004503        0.000408   
1      39.734351      1.150289         0.003673        0.000626   
2      38.781362      0.207843         0.005005        0.000409   
3      44.671742      1.005994         0.004170        0.000238   
4      60.021576      2.429651         0.004667        0.000472   

   param_learning_rate  param_max_depth  param_n_estimators  param_subsample  \
0                 0.01                7                 249              1.0   
1                 0.10                5                 149              1.0   
2                 0.10                7                 186              0.6   
3                 0.20                5                 364              0.6   
4                 0.20                3         

### Árboles de decisión

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_dt = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                              param_grid=param_grid_dt,
                              cv=5,
                              scoring='f1',
                              n_jobs=-1,
                              verbose=1)

grid_search_dt.fit(X_train, y_train)
cv_result_dt = grid_search_dt.cv_results_
best_dt_model = grid_search_dt.best_estimator_

# TODO: delete if made in next cell
# print("Mejor modelo:", best_dt_model)
# print("cv_results:", cv_result_dt)
# print("Mejores hiperparámetros:", grid_search_dt.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [36]:
print("Mejor modelo:", best_dt_model)
print("cv_results:", cv_result_dt)
print("Mejores hiperparámetros:", grid_search_dt.best_params_)

Mejor modelo: DecisionTreeClassifier(min_samples_leaf=4, random_state=42)
cv_results: {'mean_fit_time': array([0.57916842, 0.56826181, 0.564957  , 0.53587351, 0.53157229,
       0.47021503, 0.43798151, 0.43087468, 0.4740169 , 0.51594973,
       0.5135591 , 0.50435009, 0.47341571, 0.54677634, 0.50854311,
       0.47691073, 0.44968972, 0.45219789, 0.5369935 , 0.53428493,
       0.50205207, 0.47503209, 0.46921668, 0.50993814, 0.44399161,
       0.43367729, 0.44088187, 0.51905627, 0.5090414 , 0.4950418 ,
       0.48964357, 0.49303808, 0.4869318 , 0.44060259, 0.41814756,
       0.40474272]), 'std_fit_time': array([0.09007959, 0.07871079, 0.08879998, 0.06222754, 0.0673204 ,
       0.0436684 , 0.03729419, 0.05143915, 0.06151356, 0.0490777 ,
       0.04923622, 0.04201112, 0.03597816, 0.08060227, 0.0447399 ,
       0.04304211, 0.05007989, 0.04240586, 0.03312937, 0.05246877,
       0.04910222, 0.03591223, 0.04296075, 0.06918843, 0.03051167,
       0.04228588, 0.03471367, 0.06601044, 0.08229236, 

### Regresión logística

In [37]:
from sklearn.linear_model import LogisticRegression

param_grid_lr = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid_search_lr = GridSearchCV(estimator=LogisticRegression(random_state=42, max_iter=1000),
                              param_grid=param_grid_lr,
                              cv=5,
                              scoring='f1',
                              n_jobs=-1,
                              verbose=1)

grid_search_lr.fit(X_train, y_train)
cv_results_lr = grid_search_lr.cv_results_
best_lr_model = grid_search_lr.best_estimator_

# TODO: delete if made in next cell
# print("Mejor modelo:", best_lr_model)
# print("cv_results:", cv_results_lr)
# print("Mejores hiperparámetros:", grid_search_lr.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [38]:
print("Mejor modelo:", best_lr_model)
print("cv_results:", cv_results_lr)
print("Mejores hiperparámetros:", grid_search_lr.best_params_)

Mejor modelo: LogisticRegression(C=0.1, max_iter=1000, random_state=42, solver='liblinear')
cv_results: {'mean_fit_time': array([0.26894403, 0.59335647, 1.52860398, 0.8105361 , 2.88873148,
       0.76805553, 2.68196554, 0.65145345]), 'std_fit_time': array([0.07677128, 0.18540821, 0.32344248, 0.16719531, 0.45146015,
       0.23979071, 0.53067692, 0.11860011]), 'mean_score_time': array([0.00551929, 0.00785642, 0.00260749, 0.00712972, 0.00160422,
       0.0036109 , 0.00139985, 0.00221577]), 'std_score_time': array([0.00484095, 0.00480786, 0.00107574, 0.00428929, 0.00037327,
       0.00140665, 0.00048942, 0.00066181]), 'param_C': masked_array(data=[0.1, 0.1, 1.0, 1.0, 10.0, 10.0, 100.0, 100.0],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=1e+20), 'param_penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1', 'l2'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dt

### Redes neuronales

In [39]:
from sklearn.neural_network import MLPClassifier

param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive']
}

grid_search_mlp = GridSearchCV(estimator=MLPClassifier(random_state=42, max_iter=1000),
                               param_grid=param_grid_mlp,
                               cv=5,
                               scoring='f1',
                               n_jobs=-1,
                               verbose=1)

grid_search_mlp.fit(X_train, y_train)
cv_results_mlp = grid_search_mlp.cv_results_
best_mlp_model = grid_search_mlp.best_estimator_

# TODO: delete if made in next cell
# print("Mejor modelo:", best_mlp_model)
# print("cv_results:", cv_results_mlp)
# print("Mejores hiperparámetros:", grid_search_mlp.best_params_)



Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [40]:
print("Mejor modelo:", best_mlp_model)
print("cv_results:", cv_results_mlp)
print("Mejores hiperparámetros:", grid_search_mlp.best_params_)

Mejor modelo: MLPClassifier(alpha=0.05, hidden_layer_sizes=(100, 50), max_iter=1000,
              random_state=42, solver='sgd')
cv_results: {'mean_fit_time': array([ 3.87970467,  0.75832758,  5.36677337,  0.87366238, 13.5084259 ,
        1.46685386, 13.79641633,  1.24746413,  3.78260603,  0.80066037,
        5.57100277,  0.82425303, 10.82211061,  1.36409712, 11.54525957,
        1.30511575,  3.73662014,  1.05971951,  4.75871143,  0.96078787,
       11.63526497,  1.4854435 , 13.37347474,  1.11229253,  5.39667487,
        0.93802409,  5.40499845,  0.91654973, 11.69485416,  1.28244987,
       10.12973876,  1.49001765,  1.47765269,  1.1428803 ,  2.89816647,
        1.18634567,  6.40322542,  1.09271183,  6.55280085,  1.30775933,
        2.87268982,  1.48162246,  4.23276134,  1.3857357 ,  5.0012578 ,
        3.06131229,  6.10698848,  2.60438833,  2.15548286,  1.13597717,
        3.72357225,  0.98819041,  5.21572671,  1.04584589,  6.44273448,
        0.98277192,  3.54171224,  2.05067172,  3

# Evaluación con Cross-Validation

In [41]:
# TODO: check si es realmente necesario. En teoría, ya se hizo en las celdas anteriores con el 'cv'

# from sklearn.model_selection import cross_val_score

# # Random Forest
# scores_rf = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')

# # Gradient Boosting
# scores_gb = cross_val_score(best_gb_model, X_train, y_train, cv=5, scoring='accuracy')

# # Árboles de decisión
# scores_dt = cross_val_score(best_dt_model, X_train, y_train, cv=5, scoring='accuracy')

# # Regresión logística
# scores_lr = cross_val_score(best_lr_model, X_train, y_train, cv=5, scoring='accuracy')

# # Redes neuronales
# scores_mlp = cross_val_score(best_mlp_model, X_train, y_train, cv=5, scoring='accuracy')

# print("Cross-Validation Accuracy Scores (RF):", np.mean(scores_rf))
# print("Cross-Validation Accuracy Scores (GB):", np.mean(scores_gb))
# print("Cross-Validation Accuracy Scores (DT):", np.mean(scores_dt))
# print("Cross-Validation Accuracy Scores (LR):", np.mean(scores_lr))
# print("Cross-Validation Accuracy Scores (MLP):", np.mean(scores_mlp))


# Kaggle

### Test con Gradient Boosting

In [46]:
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from skimage.transform import resize
from skimage.exposure import equalize_hist

suffix = '.pgm'
size = (20, 32)  # Ajusta el tamaño deseado para redimensionar las imágenes

# Verificar si estamos en el entorno de Kaggle
test_kaggle_path = 'input/obligatorio-mlia-2024/Test/Test'
test_kaggle = os.listdir(test_kaggle_path)
test_kaggle = [filename for filename in test_kaggle if filename.endswith(suffix)]

print(f'# Test Kaggle: {len(test_kaggle)}')

# Inicializar listas para IDs y las imágenes preprocesadas
kaggle_id = []
im_test_kaggle = []

# Leer y preprocesar imágenes del conjunto de test de Kaggle
for filename in tqdm(test_kaggle):
    key = filename.replace('test_', '').replace('.pgm', '')
    kaggle_id.append(key)
    path = os.path.join(test_kaggle_path, filename)
    with open(path, 'rb') as pgmf:
        image = plt.imread(pgmf)
        image = resize(image, size)
    im_test_kaggle.append(image)

# Normalizar las imágenes
im_test_kaggle_norm = [equalize_hist(im_test_kaggle[i]) for i in range(len(im_test_kaggle))]


# Test Kaggle: 7920


  0%|          | 0/7920 [00:00<?, ?it/s]

100%|██████████| 7920/7920 [00:01<00:00, 6031.89it/s]


In [47]:
import numpy as np

# Aplanar las imágenes para que sean compatibles con el modelo de Gradient Boosting
X_test_kaggle = np.array([img.flatten() for img in im_test_kaggle_norm])


In [48]:
y_test_kaggle = best_gb_model.predict(X_test_kaggle)

In [49]:
y_test_kaggle.shape

(7920,)

In [50]:
import pandas as pd

# Crear el DataFrame para la submission
submission_kaggle = pd.DataFrame({'id': kaggle_id, 'target_feature': y_test_kaggle})

# Exportar el DataFrame a un archivo CSV
submission_kaggle.to_csv('submission.csv', index=False)


: 