In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import logging
import polars as pl
from utils2 import *

# Configuración de logging
logger = configure_logging()

# Directorio para guardar los checkpoints
checkpoint_dir = create_checkpoint_dir()


In [2]:
# Ruta del archivo de datos
data_filepath = "data/diabetic_data.csv"

# Columnas categóricas a convertir
categorical_cols = [
    'race', 'gender', 'age', 'payer_code', 'medical_specialty', 
    'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 
    'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
    'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 
    'tolazamide', 'examide', 'citoglipton', 'insulin', 
    'glyburide-metformin', 'glipizide-metformin', 
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 
    'metformin-pioglitazone', 'change', 'diabetesMed', 
    'readmitted'
]

In [3]:
# Preprocesamiento de datos
df = preprocess_data(data_filepath, categorical_cols)

2024-06-27 08:28:28,931 - INFO - Loading data
2024-06-27 08:28:28,979 - INFO - Handling missing values
2024-06-27 08:28:28,985 - INFO - Converting categorical columns to numerical


In [4]:
# Selección de características y objetivo
logger.info("Selecting features and target")
features = df.select([
    'encounter_id', 'patient_nbr', 'race', 'gender', 'age', 
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 
    'num_medications'
])
target = df.select('readmitted')

2024-06-27 08:28:29,078 - INFO - Selecting features and target


In [5]:
# Convertir a tensores de PyTorch
logger.info("Converting data to PyTorch tensors")
X_tensor = torch.tensor(features.to_numpy(), dtype=torch.float32)
y_tensor = torch.tensor(target.to_numpy(), dtype=torch.float32).view(-1)

# Convertir a arrays de NumPy
X_np = features.to_numpy()
y_np = target.to_numpy().flatten()

# Verificar las dimensiones y los valores de los tensores y arrays
logger.info(f"X_tensor shape: {X_tensor.shape}, y_tensor shape: {y_tensor.shape}")
logger.info(f"Sample y_tensor values: {y_tensor.unique()}")
logger.info(f"X_np shape: {X_np.shape}, y_np shape: {y_np.shape}")

2024-06-27 08:28:29,086 - INFO - Converting data to PyTorch tensors
2024-06-27 08:28:29,095 - INFO - X_tensor shape: torch.Size([101766, 12]), y_tensor shape: torch.Size([101766])
2024-06-27 08:28:29,102 - INFO - Sample y_tensor values: tensor([0., 1., 2.])
2024-06-27 08:28:29,102 - INFO - X_np shape: (101766, 12), y_np shape: (101766,)


In [6]:
# Crear un DataLoader
logger.info("Creating DataLoader")
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

2024-06-27 08:28:29,108 - INFO - Creating DataLoader


# MNN (Pytorch)

In [7]:
# Definir el modelo
input_dim = X_tensor.shape[1]
output_dim = 3  # Para 3 clases
model = SimpleNN(input_dim, output_dim)

# Entrenar el modelo en GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

model = model.to(device)
criterion = nn.CrossEntropyLoss()  # Cambiado a CrossEntropyLoss para multiclase
optimizer = optim.Adam(model.parameters(), lr=0.001)


2024-06-27 08:28:29,137 - INFO - Using device: cuda


In [8]:
# Entrenamiento del modelo
num_epochs = 10
logger.info("Starting training")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Verificación de tipos y dimensiones
        #print(f'outputs shape: {outputs.shape}, labels shape: {labels.shape}')
        #print(f'outputs dtype: {outputs.dtype}, labels dtype: {labels.dtype}')
        
        # Asegurarse de que las etiquetas sean del tipo correcto
        labels = labels.long()
        
        loss = criterion(outputs, labels)  # CrossEntropyLoss espera que las etiquetas sean torch.long
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    logger.info(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}')

    # Guardar checkpoint al final de cada epoch
    save_checkpoint(epoch, model, optimizer, avg_epoch_loss, checkpoint_dir)

logger.info("Training completed.")


2024-06-27 08:28:30,857 - INFO - Starting training
2024-06-27 08:28:37,871 - INFO - Epoch 1/10, Loss: 102822.1074
2024-06-27 08:28:37,877 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_1.pth
2024-06-27 08:28:44,934 - INFO - Epoch 2/10, Loss: 9179.1247
2024-06-27 08:28:44,939 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_2.pth
2024-06-27 08:28:52,041 - INFO - Epoch 3/10, Loss: 0.9466
2024-06-27 08:28:52,047 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_3.pth
2024-06-27 08:28:59,092 - INFO - Epoch 4/10, Loss: 0.9452
2024-06-27 08:28:59,097 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_4.pth
2024-06-27 08:29:06,310 - INFO - Epoch 5/10, Loss: 0.9452
2024-06-27 08:29:06,314 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_5.pth
2024-06-27 08:29:13,283 - INFO - Epoch 6/10, Loss: 0.9453
2024-06-27 08:29:13,288 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_6.pth
2024-06-27 08:29:20,486 - INFO - Epoch 7/10, Loss: 0.9453
2024-06-27 08:29:20

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluación del modelo
logger.info("Evaluating the model")
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)  # Obtener el índice de la clase con mayor probabilidad
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calcular las métricas
accuracy_torch = accuracy_score(all_labels, all_preds)
precision_torch = precision_score(all_labels, all_preds, average='weighted', zero_division=1)
recall_torch = recall_score(all_labels, all_preds, average='weighted')
f1_torch = f1_score(all_labels, all_preds, average='weighted')

logger.info(f"Model Accuracy: {accuracy_torch}")
logger.info(f"Model Precision: {precision_torch}")
logger.info(f"Model Recall: {recall_torch}")
logger.info(f"Model F1 Score: {f1_torch}")

2024-06-27 08:29:41,704 - INFO - Evaluating the model
2024-06-27 08:29:44,799 - INFO - Model Accuracy: 0.5391191557101586
2024-06-27 08:29:44,800 - INFO - Model Precision: 0.7515303083434756
2024-06-27 08:29:44,801 - INFO - Model Recall: 0.5391191557101586
2024-06-27 08:29:44,802 - INFO - Model F1 Score: 0.37768286227264436


# CNN

In [10]:
# Preparar los datos y el modelo
input_dim = X_tensor.shape[1]
output_dim = 3  # Para 3 clases
model = CNN(input_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Crear DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [11]:
# Entrenamiento del modelo
num_epochs = 10
logger.info("Starting training with CNN")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Verificación de tipos y dimensiones
        #print(f'outputs shape: {outputs.shape}, labels shape: {labels.shape}')
        #print(f'outputs dtype: {outputs.dtype}, labels dtype: {labels.dtype}')
        
        # Asegurarse de que las etiquetas sean del tipo correcto
        labels = labels.long()
        
        loss = criterion(outputs, labels)  # CrossEntropyLoss espera que las etiquetas sean torch.long
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    logger.info(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}')

    # Guardar checkpoint al final de cada epoch
    save_checkpoint(epoch, model, optimizer, avg_epoch_loss, checkpoint_dir)

logger.info("Training completed.")


2024-06-27 08:29:44,823 - INFO - Starting training with CNN
2024-06-27 08:29:54,674 - INFO - Epoch 1/10, Loss: 17923.1220
2024-06-27 08:29:54,680 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_1.pth
2024-06-27 08:30:04,481 - INFO - Epoch 2/10, Loss: 200.8769
2024-06-27 08:30:04,486 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_2.pth
2024-06-27 08:30:14,302 - INFO - Epoch 3/10, Loss: 0.9366
2024-06-27 08:30:14,309 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_3.pth
2024-06-27 08:30:24,074 - INFO - Epoch 4/10, Loss: 0.9338
2024-06-27 08:30:24,081 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_4.pth
2024-06-27 08:30:33,905 - INFO - Epoch 5/10, Loss: 1.1471
2024-06-27 08:30:33,911 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_5.pth
2024-06-27 08:30:43,641 - INFO - Epoch 6/10, Loss: 1.0380
2024-06-27 08:30:43,648 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_6.pth
2024-06-27 08:30:53,424 - INFO - Epoch 7/10, Loss: 16.3760
2024-06-27 

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluación del modelo
logger.info("Evaluating the CNN model")
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)  # Obtener el índice de la clase con mayor probabilidad
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calcular las métricas
accuracy_cnn = accuracy_score(all_labels, all_preds)
precision_cnn = precision_score(all_labels, all_preds, average='weighted', zero_division=1)
recall_cnn = recall_score(all_labels, all_preds, average='weighted')
f1_cnn = f1_score(all_labels, all_preds, average='weighted')

logger.info(f"CNN Accuracy: {accuracy_cnn}")
logger.info(f"CNN Precision: {precision_cnn}")
logger.info(f"CNN Recall: {recall_cnn}")
logger.info(f"CNN F1 Score: {f1_cnn}")

2024-06-27 08:31:23,540 - INFO - Evaluating the CNN model
2024-06-27 08:31:27,613 - INFO - CNN Accuracy: 0.5399544052040957
2024-06-27 08:31:27,613 - INFO - CNN Precision: 0.5635968739096631
2024-06-27 08:31:27,614 - INFO - CNN Recall: 0.5399544052040957
2024-06-27 08:31:27,615 - INFO - CNN F1 Score: 0.4014093274978078


# RNN

In [13]:
# Asegurarse de que X_tensor tenga una dimensión de secuencia de longitud 1
# Crear una copia de X_tensor para el RNN
X_tensor_rnn = X_tensor.clone().unsqueeze(1)

# Preparar los datos y el modelo
input_dim = X_tensor_rnn.shape[2]
hidden_dim = 128
output_dim = 3  # Para 3 clases
num_layers = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNN(input_dim, hidden_dim, output_dim, num_layers, device).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Crear DataLoader
dataset = TensorDataset(X_tensor_rnn, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [14]:
# Entrenamiento del modelo
num_epochs = 10
logger.info("Starting training with RNN")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Verificación de tipos y dimensiones
        #print(f'outputs shape: {outputs.shape}, labels shape: {labels.shape}')
        #print(f'outputs dtype: {outputs.dtype}, labels dtype: {labels.dtype}')
        
        # Asegurarse de que las etiquetas sean del tipo correcto
        labels = labels.long()
        
        loss = criterion(outputs, labels)  # CrossEntropyLoss espera que las etiquetas sean torch.long
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    logger.info(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}')

    # Guardar checkpoint al final de cada epoch
    save_checkpoint(epoch, model, optimizer, avg_epoch_loss, checkpoint_dir)

logger.info("Training completed.")

2024-06-27 08:31:27,644 - INFO - Starting training with RNN
2024-06-27 08:31:39,665 - INFO - Epoch 1/10, Loss: 0.9392
2024-06-27 08:31:39,675 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_1.pth
2024-06-27 08:31:51,392 - INFO - Epoch 2/10, Loss: 0.9379
2024-06-27 08:31:51,401 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_2.pth
2024-06-27 08:32:03,115 - INFO - Epoch 3/10, Loss: 0.9377
2024-06-27 08:32:03,125 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_3.pth
2024-06-27 08:32:14,825 - INFO - Epoch 4/10, Loss: 0.9373
2024-06-27 08:32:14,834 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_4.pth
2024-06-27 08:32:26,651 - INFO - Epoch 5/10, Loss: 0.9368
2024-06-27 08:32:26,659 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_5.pth
2024-06-27 08:32:38,511 - INFO - Epoch 6/10, Loss: 0.9367
2024-06-27 08:32:38,521 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_6.pth
2024-06-27 08:32:50,147 - INFO - Epoch 7/10, Loss: 0.9366
2024-06-27 08:32:5

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluación del modelo
logger.info("Evaluating the RNN model")
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)  # Obtener el índice de la clase con mayor probabilidad
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calcular las métricas
accuracy_rnn = accuracy_score(all_labels, all_preds)
precision_rnn = precision_score(all_labels, all_preds, average='weighted', zero_division=1)
recall_rnn = recall_score(all_labels, all_preds, average='weighted')
f1_rnn = f1_score(all_labels, all_preds, average='weighted')

logger.info(f"RNN Accuracy: {accuracy_rnn}")
logger.info(f"RNN Precision: {precision_rnn}")
logger.info(f"RNN Recall: {recall_rnn}")
logger.info(f"RNN F1 Score: {f1_rnn}")

2024-06-27 08:33:25,769 - INFO - Evaluating the RNN model
2024-06-27 08:33:30,344 - INFO - RNN Accuracy: 0.5431774856042293
2024-06-27 08:33:30,345 - INFO - RNN Precision: 0.5699439045010093
2024-06-27 08:33:30,345 - INFO - RNN Recall: 0.5431774856042293
2024-06-27 08:33:30,346 - INFO - RNN F1 Score: 0.4417652466289006


# Regresion Logistica

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

# Crear y entrenar el modelo de regresión logística
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Hacer predicciones y evaluar el modelo
y_pred = log_reg.predict(X_test)

# Calcular las métricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

logger.info(f"Logistic Regression Accuracy: {accuracy}")
logger.info(f"Logistic Regression Precision: {precision}")
logger.info(f"Logistic Regression Recall: {recall}")
logger.info(f"Logistic Regression F1 Score: {f1}")

2024-06-27 08:33:30,826 - INFO - Logistic Regression Accuracy: 0.5383217058072124
2024-06-27 08:33:30,827 - INFO - Logistic Regression Precision: 0.5620881913822836
2024-06-27 08:33:30,827 - INFO - Logistic Regression Recall: 0.5383217058072124
2024-06-27 08:33:30,828 - INFO - Logistic Regression F1 Score: 0.44532293893748903


# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Crear y entrenar el modelo de bosques aleatorios
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Hacer predicciones y evaluar el modelo
y_pred_rf = rf.predict(X_test)

# Calcular las métricas
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted', zero_division=1)
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

logger.info(f"Random Forest Accuracy: {accuracy_rf}")
logger.info(f"Random Forest Precision: {precision_rf}")
logger.info(f"Random Forest Recall: {recall_rf}")
logger.info(f"Random Forest F1 Score: {f1_rf}")

2024-06-27 08:33:49,159 - INFO - Random Forest Accuracy: 0.5703055910386164
2024-06-27 08:33:49,160 - INFO - Random Forest Precision: 0.5416231091212172
2024-06-27 08:33:49,162 - INFO - Random Forest Recall: 0.5703055910386164
2024-06-27 08:33:49,163 - INFO - Random Forest F1 Score: 0.5313892426143688


# Support Vector Machine (SVM)

In [20]:
# Definir el modelo
input_dim = X_tensor.shape[1]
output_dim = 3  # Para 3 clases
model = SVM(input_dim)

# Entrenar el modelo en GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


2024-06-27 08:36:32,851 - INFO - Using device: cuda


In [23]:
# Entrenamiento del modelo
num_epochs = 10
logger.info("Starting training with SVM")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Asegurarse de que las salidas tengan el tamaño correcto
        outputs = outputs.squeeze(1)
        
        # Verificación de tipos y dimensiones
        #print(f'outputs shape: {outputs.shape}, labels shape: {labels.shape}')
        #print(f'outputs dtype: {outputs.dtype}, labels dtype: {labels.dtype}')
        
        # Asegurarse de que las etiquetas sean del tipo correcto y tengan el tamaño correcto
        labels = labels.squeeze().long()
        
        loss = criterion(outputs, labels)  # CrossEntropyLoss espera que las etiquetas sean torch.long
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    logger.info(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}')

    # Guardar checkpoint al final de cada epoch
    save_checkpoint(epoch, model, optimizer, avg_epoch_loss, checkpoint_dir)

logger.info("Training completed.")


2024-06-27 08:38:53,971 - INFO - Starting training with SVM
2024-06-27 08:38:58,497 - INFO - Epoch 1/10, Loss: 83059345525015.0312
2024-06-27 08:38:58,500 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_1.pth
2024-06-27 08:39:03,063 - INFO - Epoch 2/10, Loss: 81849635417783.6562
2024-06-27 08:39:03,065 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_2.pth
2024-06-27 08:39:07,505 - INFO - Epoch 3/10, Loss: 81515527819056.6875
2024-06-27 08:39:07,507 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_3.pth
2024-06-27 08:39:11,862 - INFO - Epoch 4/10, Loss: 82220631610001.9844
2024-06-27 08:39:11,864 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_4.pth
2024-06-27 08:39:16,409 - INFO - Epoch 5/10, Loss: 82259972548499.1875
2024-06-27 08:39:16,411 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_5.pth
2024-06-27 08:39:20,809 - INFO - Epoch 6/10, Loss: 83299440152492.2969
2024-06-27 08:39:20,811 - INFO - Checkpoint saved: checkpoints\checkpoint_epoch_6.pt

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluación del modelo
logger.info("Evaluating the SVM model")
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        outputs = outputs.squeeze(1)
        _, preds = torch.max(outputs, 1)  # Obtener el índice de la clase con mayor probabilidad
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convertir las listas a arrays de NumPy para asegurar compatibilidad con sklearn
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Calcular las métricas
accuracy_svm = accuracy_score(all_labels, all_preds)
precision_svm = precision_score(all_labels, all_preds, average='weighted', zero_division=1)
recall_svm = recall_score(all_labels, all_preds, average='weighted')
f1_svm = f1_score(all_labels, all_preds, average='weighted')

logger.info(f"SVM Accuracy: {accuracy_svm}")
logger.info(f"SVM Precision: {precision_svm}")
logger.info(f"SVM Recall: {recall_svm}")
logger.info(f"SVM F1 Score: {f1_svm}")

2024-06-27 08:42:37,738 - INFO - Evaluating the SVM model
2024-06-27 08:42:40,217 - INFO - SVM Accuracy: 0.5391191557101586
2024-06-27 08:42:40,218 - INFO - SVM Precision: 0.7515303083434756
2024-06-27 08:42:40,218 - INFO - SVM Recall: 0.5391191557101586
2024-06-27 08:42:40,220 - INFO - SVM F1 Score: 0.37768286227264436


# K-Nearest Neighbor

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Crear y entrenar el modelo de K-Vecinos más Cercanos
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Hacer predicciones y evaluar el modelo
y_pred = knn.predict(X_test)

# Calcular las métricas
accuracy_knn = accuracy_score(y_test, y_pred)
precision_knn = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall_knn = recall_score(y_test, y_pred, average='weighted')
f1_knn = f1_score(y_test, y_pred, average='weighted')

logger.info(f"K-Nearest Neighbors Accuracy: {accuracy_knn}")
logger.info(f"K-Nearest Neighbors Precision: {precision_knn}")
logger.info(f"K-Nearest Neighbors Recall: {recall_knn}")
logger.info(f"K-Nearest Neighbors F1 Score: {f1_knn}")

2024-06-27 08:45:08,726 - INFO - K-Nearest Neighbors Accuracy: 0.5156725950673087
2024-06-27 08:45:08,728 - INFO - K-Nearest Neighbors Precision: 0.47017378390553505
2024-06-27 08:45:08,729 - INFO - K-Nearest Neighbors Recall: 0.5156725950673087
2024-06-27 08:45:08,730 - INFO - K-Nearest Neighbors F1 Score: 0.48191514846137073


# Gradient Boosting

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Crear y entrenar el modelo de Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

# Hacer predicciones y evaluar el modelo
y_pred = gb.predict(X_test)

# Calcular las métricas
accuracy_gb = accuracy_score(y_test, y_pred)
precision_gb = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall_gb = recall_score(y_test, y_pred, average='weighted')
f1_gb = f1_score(y_test, y_pred, average='weighted')

logger.info(f"Gradient Boosting Accuracy: {accuracy_gb}")
logger.info(f"Gradient Boosting Precision: {precision_gb}")
logger.info(f"Gradient Boosting Recall: {recall_gb}")
logger.info(f"Gradient Boosting F1 Score: {f1_gb}")

2024-06-27 08:46:07,464 - INFO - Gradient Boosting Accuracy: 0.5738429792669746
2024-06-27 08:46:07,465 - INFO - Gradient Boosting Precision: 0.575850997583986
2024-06-27 08:46:07,466 - INFO - Gradient Boosting Recall: 0.5738429792669746
2024-06-27 08:46:07,466 - INFO - Gradient Boosting F1 Score: 0.5252324064242889


# Red Neuronal Multicapa

In [28]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Crear y entrenar el modelo de Red Neuronal Multicapa
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# Hacer predicciones y evaluar el modelo
y_pred = mlp.predict(X_test)

# Calcular las métricas
accuracy_mlp = accuracy_score(y_test, y_pred)
precision_mlp = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall_mlp = recall_score(y_test, y_pred, average='weighted')
f1_mlp = f1_score(y_test, y_pred, average='weighted')

logger.info(f"MLP Accuracy: {accuracy_mlp}")
logger.info(f"MLP Precision: {precision_mlp}")
logger.info(f"MLP Recall: {recall_mlp}")
logger.info(f"MLP F1 Score: {f1_mlp}")

2024-06-27 08:48:47,560 - INFO - MLP Accuracy: 0.5380760538469097
2024-06-27 08:48:47,561 - INFO - MLP Precision: 0.7514497858765528
2024-06-27 08:48:47,563 - INFO - MLP Recall: 0.5380760538469097
2024-06-27 08:48:47,563 - INFO - MLP F1 Score: 0.376477923831301


# Arbol de Decision

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Crear y entrenar el modelo de Árbol de Decisión
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Hacer predicciones y evaluar el modelo
y_pred = dt.predict(X_test)

# Calcular las métricas
accuracy_dt = accuracy_score(y_test, y_pred)
precision_dt = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall_dt = recall_score(y_test, y_pred, average='weighted')
f1_dt = f1_score(y_test, y_pred, average='weighted')

logger.info(f"Decision Tree Accuracy: {accuracy_dt}")
logger.info(f"Decision Tree Precision: {precision_dt}")
logger.info(f"Decision Tree Recall: {recall_dt}")
logger.info(f"Decision Tree F1 Score: {f1_dt}")

2024-06-27 08:49:54,863 - INFO - Decision Tree Accuracy: 0.4713569814287118
2024-06-27 08:49:54,864 - INFO - Decision Tree Precision: 0.477029116876877
2024-06-27 08:49:54,864 - INFO - Decision Tree Recall: 0.4713569814287118
2024-06-27 08:49:54,866 - INFO - Decision Tree F1 Score: 0.4741007214725254


# Seleccion del Mejor Modelo

In [31]:
# Suponiendo que tenemos las métricas de varios modelos
results = {
    "Model": ["Logistic Regression", "Random Forest", "SVM (PyTorch)", "K-Nearest Neighbors", "Gradient Boosting", "MLP (scikit-learn)", "MLP (PyTorch)", "Decision Tree", "CNN (PyTorch)", "RNN (PyTorch)"],
    "Accuracy": [accuracy, accuracy_rf, accuracy_svm, accuracy_knn, accuracy_gb, accuracy_mlp, accuracy_torch, accuracy_dt, accuracy_cnn, accuracy_rnn],
    "Precision": [precision, precision_rf, precision_svm, precision_knn, precision_gb, precision_mlp, precision_torch, precision_dt, precision_cnn, precision_rnn],
    "Recall": [recall, recall_rf, recall_svm, recall_knn, recall_gb, recall_mlp, recall_torch, recall_dt, recall_cnn, recall_rnn],
    "F1 Score": [f1, f1_rf, f1_svm, f1_knn, f1_gb, f1_mlp, f1_torch, f1_dt, f1_cnn, f1_rnn]
}

import pandas as pd

results_df = pd.DataFrame(results)
print(results_df)

# Seleccionar el mejor modelo basado en una combinación de métricas
best_model_index = results_df[['Accuracy', 'Precision', 'Recall', 'F1 Score']].mean(axis=1).idxmax()
print(f"The best model based on the combination of metrics is: {results_df.iloc[best_model_index]['Model']}")

# Mostrar todos los resultados
print("Model performance comparison:")
print(results_df)


                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.538322   0.562088  0.538322  0.445323
1        Random Forest  0.570306   0.541623  0.570306  0.531389
2        SVM (PyTorch)  0.539119   0.751530  0.539119  0.377683
3  K-Nearest Neighbors  0.515673   0.470174  0.515673  0.481915
4    Gradient Boosting  0.573843   0.575851  0.573843  0.525232
5   MLP (scikit-learn)  0.538076   0.751450  0.538076  0.376478
6        MLP (PyTorch)  0.539119   0.751530  0.539119  0.377683
7        Decision Tree  0.471357   0.477029  0.471357  0.474101
8        CNN (PyTorch)  0.539954   0.563597  0.539954  0.401409
9        RNN (PyTorch)  0.543177   0.569944  0.543177  0.441765
The best model based on the combination of metrics is: Gradient Boosting
Model performance comparison:
                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.538322   0.562088  0.538322  0.445323
1        Random Forest  0.570306   0.541623  0.570306  0.531389
2