In [14]:
# Cargar datos y convertir a numéricos
train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv("test_df.csv")

# Eliminar columnas no numéricas y convertir
X_train = train_df.drop(columns=["paciente_id", "target"]).apply(pd.to_numeric, errors="coerce").dropna()
y_train = train_df.loc[X_train.index, "target"].values.reshape(-1, 1)
X_test = test_df.drop(columns=["paciente_id"]).apply(pd.to_numeric, errors="coerce").dropna()

# Normalizar
X_train_normalized = normalize(X_train.values.astype(np.float64))
X_test_normalized = normalize(X_test.values.astype(np.float64))

# Añadir bias
X_train_final = np.hstack([np.ones((X_train_normalized.shape[0], 1)), X_train_normalized])
X_test_final = np.hstack([np.ones((X_test_normalized.shape[0], 1)), X_test_normalized])

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = um.true_divide(


In [1]:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def compute_loss(y_true, y_pred):
    m = y_true.shape[0]
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

def train_logistic_regression(X, y, lr=0.01, epochs=1000):
    m, n = X.shape
    theta = np.zeros((n, 1))
    bias = 0
    loss_history = []

    for epoch in range(epochs):
        z = np.dot(X, theta) + bias
        y_hat = sigmoid(z)

        loss = compute_loss(y, y_hat)
        loss_history.append(loss)

        dz = y_hat - y
        dw = np.dot(X.T, dz) / m
        db = np.sum(dz) / m

        theta -= lr * dw
        bias -= lr * db

    return theta, bias, loss_history


In [31]:
def predict(X, theta, bias, threshold=0.5):
    probs = sigmoid(np.dot(X, theta) + bias)
    return (probs >= threshold).astype(int)

def compute_f1_score(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    if tp + fp == 0 or tp + fn == 0:
        return 0.0  # evitar división por cero

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)

def normalize(X):
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Evitar división por cero
    norms[norms == 0] = 1
    return X / norms



In [32]:
import pandas as pd
import numpy as np

# Cargar tu dataframe (asegúrate de que esté en tu entorno)
train_df = pd.read_csv("train_df.csv")  # Reemplaza con la ruta a tu archivo CSV
df = train_df.copy()
df = df.drop(columns=["paciente_id"])         # 1. Eliminar ID
df["genero"] = df["genero"].map({"M": 1, "F": 0})  # 2. Codificar genero

X = df.drop(columns=["target"])
y = df["target"].values.reshape(-1, 1)

# 3. Estandarizar variables numéricas (menos 'genero')
numeric_cols = X.columns.tolist()
numeric_cols.remove("genero")

means = X[numeric_cols].mean()
stds = X[numeric_cols].std()
X[numeric_cols] = (X[numeric_cols] - means) / stds

X_final = X.values  # X_final vuelve a existir


In [33]:
def train_val_split(X, y, val_ratio=0.2, seed=42):
    np.random.seed(seed)
    m = X.shape[0]
    indices = np.random.permutation(m)
    val_size = int(m * val_ratio)
    
    val_idx = indices[:val_size]
    train_idx = indices[val_size:]
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]
    
    return X_train, y_train, X_val, y_val


In [49]:
X_train, y_train, X_val, y_val = train_val_split(X_final, y)
theta, bias, losses = train_logistic_regression(X_train, y_train, lr=0.1, epochs=1000)
y_val_pred = predict(X_val, theta, bias)
f1_val = compute_f1_score(y_val.ravel(), y_val_pred.ravel())

print("F1-Score en Validación:", f1_val)


F1-Score en Validación: 0.4576271186440678


In [8]:
learning_rates = [0.01, 0.05, 0.1, 0.2]
epoch_list = [500, 1000, 2000]

best_f1 = 0
best_params = (None, None)

print("Probando combinaciones...\n")

for lr in learning_rates:
    for epochs in epoch_list:
        theta, bias, _ = train_logistic_regression(X_train, y_train, lr=lr, epochs=epochs)
        y_val_pred = predict(X_val, theta, bias)
        f1 = compute_f1_score(y_val.ravel(), y_val_pred.ravel())
        
        print(f"lr = {lr}, epochs = {epochs} => F1-score: {f1:.4f}")
        
        if f1 > best_f1:
            best_f1 = f1
            best_params = (lr, epochs)

print("\n✅ Mejor combinación:")
print(f"Learning Rate: {best_params[0]}")
print(f"Epochs: {best_params[1]}")
print(f"F1-Score: {best_f1:.4f}")


Probando combinaciones...

lr = 0.01, epochs = 500 => F1-score: 0.4636
lr = 0.01, epochs = 1000 => F1-score: 0.4584
lr = 0.01, epochs = 2000 => F1-score: 0.4571
lr = 0.05, epochs = 500 => F1-score: 0.4595
lr = 0.05, epochs = 1000 => F1-score: 0.4552
lr = 0.05, epochs = 2000 => F1-score: 0.4576
lr = 0.1, epochs = 500 => F1-score: 0.4552
lr = 0.1, epochs = 1000 => F1-score: 0.4576
lr = 0.1, epochs = 2000 => F1-score: 0.4610
lr = 0.2, epochs = 500 => F1-score: 0.4576
lr = 0.2, epochs = 1000 => F1-score: 0.4610
lr = 0.2, epochs = 2000 => F1-score: 0.4615

✅ Mejor combinación:
Learning Rate: 0.01
Epochs: 500
F1-Score: 0.4636


In [None]:
theta_final, bias_final, _ = train_logistic_regression(X_final, y, lr=0.01, epochs=500)

test_df = pd.read_csv(r"C:\Users\dfqo2\Desktop\ALC\Kaggle\Proyecto2\test_df.csv")
test_df = test_df.drop(columns=["paciente_id"])  # Eliminar ID
test_df["genero"] = test_df["genero"].map({"M": 1, "F": 0})  # Codificar genero



In [None]:
print("test_df shape:", test_df.shape)
# Obtener las columnas que usaste en entrenamiento
train_columns = X_train.columns

# Dejar solo esas columnas en el test
X_test = test_df[train_columns].apply(pd.to_numeric, errors="coerce")

# Rellenar valores faltantes
X_test = X_test.fillna(X_test.mean())

# Normalizar
X_test_normalized = normalize(X_test.values.astype(np.float64))

# Añadir bias
X_test_final = np.hstack([np.ones((X_test_normalized.shape[0], 1)), X_test_normalized])

# Confirmar dimensiones
print("X_test_final shape:", X_test_final.shape)
print("theta shape:", theta.shape)




test_df shape: (10500, 15)
X_test_final shape: (10500, 15)
theta shape: (14, 1)


In [52]:
import pandas as pd
import numpy as np

# Cargar datos
train_df = pd.read_csv("train_df.csv")

# Preprocesamiento
df = train_df.copy()
df["genero"] = df["genero"].map({"M": 1, "F": 0})  # Codificar genero
paciente_ids = df["paciente_id"].values  # Guardar IDs
df = df.drop(columns=["paciente_id", "target"])

# Normalización
def normalize(X):
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1
    return X / norms

# Estándarización numérica (excepto 'genero')
numeric_cols = df.columns.tolist()
numeric_cols.remove("genero")
means = df[numeric_cols].mean()
stds = df[numeric_cols].std()
df[numeric_cols] = (df[numeric_cols] - means) / stds

# Convertir a matriz
X = df.values.astype(np.float64)
X_normalized = normalize(X)

# NO añadimos bias manualmente, el modelo ya lo tiene separado
X_with_bias = X_normalized

# Hacer predicciones
y_pred_probs = sigmoid(np.dot(X_with_bias, theta_final) + bias_final)
y_pred_labels = (y_pred_probs >= 0.5).astype(int).flatten()

# Crear archivo de submission
submission_df = pd.DataFrame({
    "paciente_id": paciente_ids,
    "target": y_pred_labels
})

submission_df.to_csv("submission.csv", index=False)
print("✅ Archivo submission.csv guardado correctamente.")


✅ Archivo submission.csv guardado correctamente.
