In [14]:
# Cargar datos y convertir a numéricos
train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv("test_df.csv")

# Eliminar columnas no numéricas y convertir
X_train = train_df.drop(columns=["paciente_id", "target"]).apply(pd.to_numeric, errors="coerce").dropna()
y_train = train_df.loc[X_train.index, "target"].values.reshape(-1, 1)
X_test = test_df.drop(columns=["paciente_id"]).apply(pd.to_numeric, errors="coerce").dropna()

# Normalizar
X_train_normalized = normalize(X_train.values.astype(np.float64))
X_test_normalized = normalize(X_test.values.astype(np.float64))

# Añadir bias
X_train_final = np.hstack([np.ones((X_train_normalized.shape[0], 1)), X_train_normalized])
X_test_final = np.hstack([np.ones((X_test_normalized.shape[0], 1)), X_test_normalized])

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = um.true_divide(


In [1]:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def compute_loss(y_true, y_pred):
    m = y_true.shape[0]
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

def train_logistic_regression(X, y, lr=0.01, epochs=1000):
    m, n = X.shape
    theta = np.zeros((n, 1))
    bias = 0
    loss_history = []

    for epoch in range(epochs):
        z = np.dot(X, theta) + bias
        y_hat = sigmoid(z)

        loss = compute_loss(y, y_hat)
        loss_history.append(loss)

        dz = y_hat - y
        dw = np.dot(X.T, dz) / m
        db = np.sum(dz) / m

        theta -= lr * dw
        bias -= lr * db

    return theta, bias, loss_history


In [2]:
def predict(X, theta, bias, threshold=0.5):
    probs = sigmoid(np.dot(X, theta) + bias)
    return (probs >= threshold).astype(int)

def compute_f1_score(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    if tp + fp == 0 or tp + fn == 0:
        return 0.0  # evitar división por cero

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)



In [3]:
import pandas as pd
import numpy as np

# Cargar tu dataframe (asegúrate de que esté en tu entorno)
train_df = pd.read_csv("train_df.csv")  # Reemplaza con la ruta a tu archivo CSV
df = train_df.copy()
df = df.drop(columns=["paciente_id"])         # 1. Eliminar ID
df["genero"] = df["genero"].map({"M": 1, "F": 0})  # 2. Codificar genero

X = df.drop(columns=["target"])
y = df["target"].values.reshape(-1, 1)

# 3. Estandarizar variables numéricas (menos 'genero')
numeric_cols = X.columns.tolist()
numeric_cols.remove("genero")

means = X[numeric_cols].mean()
stds = X[numeric_cols].std()
X[numeric_cols] = (X[numeric_cols] - means) / stds

X_final = X.values  # X_final vuelve a existir


In [4]:
def train_val_split(X, y, val_ratio=0.2, seed=42):
    np.random.seed(seed)
    m = X.shape[0]
    indices = np.random.permutation(m)
    val_size = int(m * val_ratio)
    
    val_idx = indices[:val_size]
    train_idx = indices[val_size:]
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]
    
    return X_train, y_train, X_val, y_val


In [5]:
X_train, y_train, X_val, y_val = train_val_split(X_final, y)


In [6]:
theta, bias, losses = train_logistic_regression(X_train, y_train, lr=0.1, epochs=1000)


In [7]:
y_val_pred = predict(X_val, theta, bias)
f1_val = compute_f1_score(y_val.ravel(), y_val_pred.ravel())

print("F1-Score en Validación:", f1_val)


F1-Score en Validación: 0.4576271186440678


In [8]:
learning_rates = [0.01, 0.05, 0.1, 0.2]
epoch_list = [500, 1000, 2000]

best_f1 = 0
best_params = (None, None)

print("Probando combinaciones...\n")

for lr in learning_rates:
    for epochs in epoch_list:
        theta, bias, _ = train_logistic_regression(X_train, y_train, lr=lr, epochs=epochs)
        y_val_pred = predict(X_val, theta, bias)
        f1 = compute_f1_score(y_val.ravel(), y_val_pred.ravel())
        
        print(f"lr = {lr}, epochs = {epochs} => F1-score: {f1:.4f}")
        
        if f1 > best_f1:
            best_f1 = f1
            best_params = (lr, epochs)

print("\n✅ Mejor combinación:")
print(f"Learning Rate: {best_params[0]}")
print(f"Epochs: {best_params[1]}")
print(f"F1-Score: {best_f1:.4f}")


Probando combinaciones...

lr = 0.01, epochs = 500 => F1-score: 0.4636
lr = 0.01, epochs = 1000 => F1-score: 0.4584
lr = 0.01, epochs = 2000 => F1-score: 0.4571
lr = 0.05, epochs = 500 => F1-score: 0.4595
lr = 0.05, epochs = 1000 => F1-score: 0.4552
lr = 0.05, epochs = 2000 => F1-score: 0.4576
lr = 0.1, epochs = 500 => F1-score: 0.4552
lr = 0.1, epochs = 1000 => F1-score: 0.4576
lr = 0.1, epochs = 2000 => F1-score: 0.4610
lr = 0.2, epochs = 500 => F1-score: 0.4576
lr = 0.2, epochs = 1000 => F1-score: 0.4610
lr = 0.2, epochs = 2000 => F1-score: 0.4615

✅ Mejor combinación:
Learning Rate: 0.01
Epochs: 500
F1-Score: 0.4636


In [9]:
theta_final, bias_final, _ = train_logistic_regression(X_final, y, lr=0.01, epochs=500)


In [21]:
test_df = pd.read_csv(r"C:\Users\dfqo2\Desktop\ALC\Kaggle\Proyecto2\test_df.csv")
test_df = test_df.drop(columns=["paciente_id"])  # Eliminar ID
test_df["genero"] = test_df["genero"].map({"M": 1, "F": 0})  # Codificar genero


In [30]:
# Estandarizar variables numéricas (menos 'genero')
print("Forma de test_df:", test_df.shape)  # Debe tener filas > 0
print("Columnas de test_df:", test_df.columns)

# Verificar dimensiones
print("Forma de X_test_final:", X_test_final.shape)  # Debe ser (n_filas, 15)
print("Forma de theta_final:", theta_final.shape)    # Debe ser (15, 1) o (16, 1) si hay bias

# Añadir columna de unos (bias)
X_test_final = np.hstack([np.ones((X_test_final.shape[0], 1)), X_test_final])

# Eliminar columna de unos (si no se usa bias)
X_test_final = X_test_final[:, 1:]


# Cargar test_df con verificación explícita
try:
    test_df = pd.read_csv("test_df.csv")
    print("Filas en test_df:", test_df.shape[0])
    if test_df.empty:
        raise ValueError("El archivo test_df.csv está vacío.")
except FileNotFoundError:
    raise FileNotFoundError("El archivo test_df.csv no existe.")



# Normalizar test_df
# Drop the 'target' column only if it exists
if 'target' in test_df.columns:
    X_test = test_df.drop(columns=["target"]).apply(pd.to_numeric, errors="coerce").dropna()
else:
    # Ensure that numeric conversion does not result in an empty dataset
    X_test = test_df.apply(pd.to_numeric, errors="coerce")
    if X_test.isnull().all(axis=None):
        raise ValueError("Todos los valores en test_df son NaN después de la conversión a numéricos.")
    X_test = X_test.dropna()
X_test = (X_test - means) / stds  # Normalizar usando los mismos parámetros que X_train
X_test = X_test.values.astype(np.float64)


# Verificar que X_test no está vacío
if X_test.shape[0] == 0:
    print("Advertencia: X_test está vacío después del preprocesamiento. Intentando manejar datos faltantes...")
    print("Mostrando las primeras filas de test_df para depuración:")
    print(test_df.head())
    
    # Intentar manejar datos faltantes rellenando con la media de las columnas
    X_test = test_df[numeric_cols].fillna(means).apply(pd.to_numeric, errors="coerce")
    X_test = (X_test - means) / stds  # Normalizar usando los mismos parámetros que X_train
    X_test = X_test.values.astype(np.float64)
    
    # Verificar nuevamente si X_test sigue vacío
    if X_test.shape[0] == 0:
        raise ValueError("X_test sigue vacío después de intentar manejar datos faltantes. Verifica los datos de entrada.")

# Añadir columna de unos (bias)
X_test_final = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Verificar dimensiones
print("Forma de X_test_final:", X_test_final.shape)  # Debe ser (n, 16)
print("Forma de theta_final:", theta_final.shape)    # Debe ser (16, 1)

# Realizar predicción
y_test_pred = np.dot(X_test_final, theta_final) + bias_final

Forma de test_df: (10500, 15)
Columnas de test_df: Index(['paciente_id', 'ratio_colesterol', 'actividad_fisica',
       'presion_arterial', 'nivel_glucosa', 'indice_masa_corporal',
       'horas_sueno', 'historial_diabetes', 'frecuencia_cardiaca',
       'proteina_c_reactiva', 'dias_ultima_consulta', 'consumo_alcohol',
       'edad', 'genero', 'nivel_estres'],
      dtype='object')
Forma de X_test_final: (0, 16)
Forma de theta_final: (14, 1)
Filas en test_df: 10500
Advertencia: X_test está vacío después del preprocesamiento. Intentando manejar datos faltantes...
Mostrando las primeras filas de test_df para depuración:
  paciente_id  ratio_colesterol  actividad_fisica  presion_arterial  \
0   PAC_05946        123.592241          6.944192        110.218667   
1   PAC_12392        146.889149         19.271328        148.504026   
2   PAC_12788        179.236836         14.944609        138.368598   
3   PAC_08432        158.491789         18.881072        105.230178   
4   PAC_08189      