In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Cargar dataset
url = "https://github.com/ulewis/Ejemplos/raw/main/Datos/healthcare_dataset.csv"
df = pd.read_csv(url)

print("Columnas originales:", df.columns.tolist())

Columnas originales: ['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date', 'Medication', 'Test Results']


In [5]:
# Target: Billing Amount
target = "Billing Amount"

# Eliminar columnas obviamente irrelevantes (IDs, nombres, etc.)
cols_drop = [col for col in df.columns if "ID" in col or "Name" in col or "Address" in col]
df = df.drop(columns=cols_drop, errors="ignore")

In [6]:
# Eliminar filas con Billing Amount vacío
df = df.dropna(subset=[target])

# Separar features y target
X = df.drop(columns=[target])
y_cont = df[target]

# Discretizar Billing Amount en clases (ej: 3 clases: bajo, medio, alto)
N_CLASES = 3
y = pd.qcut(y_cont, q=N_CLASES, labels=False, duplicates="drop")

In [7]:
# Convertir variables categóricas a numéricas (one-hot encoding)
X = pd.get_dummies(X, drop_first=True)

print("Shape después de dummies:", X.shape)

# Escalar
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=101, stratify=y
)

Shape después de dummies: (10000, 21727)


In [8]:
# Baseline con k=24
knn = KNeighborsClassifier(n_neighbors=24, weights="uniform", p=2)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

acc = accuracy_score(y_test, pred)
err = (1 - acc) * 100
print("\n=== BASELINE (k=24) ===")
print(f"Accuracy : {acc*100:.2f}%")
print(f"Tasa de error: {err:.2f}%")
print(classification_report(y_test, pred, digits=4))



=== BASELINE (k=24) ===
Accuracy : 33.33%
Tasa de error: 66.67%
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      1000
           1     0.3333    1.0000    0.5000      1000
           2     0.0000    0.0000    0.0000      1000

    accuracy                         0.3333      3000
   macro avg     0.1111    0.3333    0.1667      3000
weighted avg     0.1111    0.3333    0.1667      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
