# machine_learning.ipynb




In [2]:
# 1. Importación de librerías
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings
import joblib

# Ignorar warnings (opcional)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# 2. Carga de datos
BASE_PATH = "/workspaces/ejercicio_regresion_logistica/data/processed"

TRAIN_PATHS = [
    "X_train_con_outliers.xlsx",
    "X_train_sin_outliers.xlsx",
    "X_train_con_outliers_norm.xlsx",
    "X_train_sin_outliers_norm.xlsx",
    "X_train_con_outliers_scal.xlsx",
    "X_train_sin_outliers_scal.xlsx"
]
TRAIN_DATASETS = []
for path in TRAIN_PATHS:
    try:
        TRAIN_DATASETS.append(pd.read_excel(os.path.join(BASE_PATH, path)))
    except Exception as e:
        print(f"Error al cargar {path}: {e}")

TEST_PATHS = [
    "X_test_con_outliers.xlsx",
    "X_test_sin_outliers.xlsx",
    "X_test_con_outliers_norm.xlsx",
    "X_test_sin_outliers_norm.xlsx",
    "X_test_con_outliers_scal.xlsx",
    "X_test_sin_outliers_scal.xlsx"
]
TEST_DATASETS = []
for path in TEST_PATHS:
    try:
        TEST_DATASETS.append(pd.read_excel(os.path.join(BASE_PATH, path)))
    except Exception as e:
        print(f"Error al cargar {path}: {e}")

try:
    y_train = pd.read_excel(os.path.join(BASE_PATH, "y_train.xlsx")).values.ravel()
    y_test = pd.read_excel(os.path.join(BASE_PATH, "y_test.xlsx")).values.ravel()
except Exception as e:
    print(f"Error al cargar y_train o y_test: {e}")

# Verificación
print("Cantidad de datasets de TRAIN cargados:", len(TRAIN_DATASETS))
print("Cantidad de datasets de TEST cargados:", len(TEST_DATASETS))

# 3. Modelado y Evaluación
results = []
for index, dataset in enumerate(TRAIN_DATASETS):
    print(f"Entrenando modelo con dataset {index + 1}")
    model = LogisticRegression(random_state=42, max_iter=1000)  # Aumentar max_iter si es necesario
    model.fit(dataset, y_train)
    y_pred_train = model.predict(dataset)
    y_pred_test = model.predict(TEST_DATASETS[index])

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_f1 = f1_score(y_train, y_pred_train)
    test_f1 = f1_score(y_test, y_pred_test)
    train_conf_matrix = confusion_matrix(y_train, y_pred_train)
    test_conf_matrix = confusion_matrix(y_test, y_pred_test)

    results.append({
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
        "train_f1": train_f1,
        "test_f1": test_f1,
        "train_conf_matrix": train_conf_matrix,
        "test_conf_matrix": test_conf_matrix
    })

# Mostrar resultados
for i, result in enumerate(results):
    print(f"Dataset {i + 1}:")
    print(f"  Train Accuracy: {result['train_accuracy']:.4f}, Test Accuracy: {result['test_accuracy']:.4f}")
    print(f"  Train F1 Score: {result['train_f1']:.4f}, Test F1 Score: {result['test_f1']:.4f}")
    print(f"  Train Confusion Matrix:\n{result['train_conf_matrix']}")
    print(f"  Test Confusion Matrix:\n{result['test_conf_matrix']}\n")

# 4. Optimización con GridSearchCV del mejor modelo
best_dataset = 2  # Ajustar según los resultados del paso anterior

hyperparams = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l1", "l2", "elasticnet", None],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
}

model = LogisticRegression(random_state=42)
grid = GridSearchCV(model, hyperparams, scoring="accuracy")
grid.fit(TRAIN_DATASETS[best_dataset], y_train)

final_model = grid.best_estimator_
y_pred_train = final_model.predict(TRAIN_DATASETS[best_dataset])
y_pred_test = final_model.predict(TEST_DATASETS[best_dataset])
print(y_pred_train)
print(y_pred_test)

# Guardar el modelo en un archivo .sav
MODEL_PATH = "/workspaces/ejercicio_regresion_logistica/models/"
MODEL_NAME = "logistic_regression_model.sav"
joblib.dump(final_model, f"{MODEL_PATH}{MODEL_NAME}")
print(f"Modelo guardado en {MODEL_PATH}{MODEL_NAME}")

Cantidad de datasets de TRAIN cargados: 6
Cantidad de datasets de TEST cargados: 6
Entrenando modelo con dataset 1
Entrenando modelo con dataset 2
Entrenando modelo con dataset 3
Entrenando modelo con dataset 4
Entrenando modelo con dataset 5
Entrenando modelo con dataset 6
Dataset 1:
  Train Accuracy: 0.9111, Test Accuracy: 0.9024
  Train F1 Score: 0.5033, Test F1 Score: 0.4766
  Train Confusion Matrix:
[[28527   745]
 [ 2184  1484]]
  Test Confusion Matrix:
[[7066  199]
 [ 605  366]]

Dataset 2:
  Train Accuracy: 0.9046, Test Accuracy: 0.8992
  Train F1 Score: 0.4580, Test F1 Score: 0.4430
  Train Confusion Matrix:
[[28469   803]
 [ 2340  1328]]
  Test Confusion Matrix:
[[7076  189]
 [ 641  330]]

Dataset 3:
  Train Accuracy: 0.9110, Test Accuracy: 0.9038
  Train F1 Score: 0.5075, Test F1 Score: 0.4877
  Train Confusion Matrix:
[[28499   773]
 [ 2158  1510]]
  Test Confusion Matrix:
[[7067  198]
 [ 594  377]]

Dataset 4:
  Train Accuracy: 0.9049, Test Accuracy: 0.8992
  Train F1 Scor



[0 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]
Modelo guardado en /workspaces/ejercicio_regresion_logistica/models/logistic_regression_model.sav


315 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/linear_model/_logist