In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from scipy import sparse

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [4]:
# Load the ID lookup table and the main encounter table from the working directory.
# NOTE(review): read_csv applies its default NA parsing, which turns some literal
# strings (e.g. "NA", "None") into NaN — confirm no category label in these files
# relies on such a value.
ids_mapping = pd.read_csv("IDS_mapping.csv")
df = pd.read_csv("diabetic_data.csv")

Preprocessing

In [5]:
# Replace "?" with NaN for simplicity
df = df.replace("?", np.nan)

# Create binary target: 1 = readmitted <30 days, 0 = otherwise.
# The guard skips this step when "readmitted" is absent (e.g. the cell has
# already been run once and the column was dropped).
if "readmitted" in df.columns:
    # Vectorised comparison; missing values compare False and therefore map to 0.
    df["target"] = (df["readmitted"] == "<30").astype(int)
    df = df.drop(columns=["readmitted"])

# Drop ID and unhelpful columns
drop_cols = [
    "encounter_id",               # Unique identifier
    "patient_nbr",                # Unique per person
    "weight",                     # ~97% missing
    "payer_code",                 # ~40% missing
    "medical_specialty",          # High cardinality
    "examide",                    # Almost all 0s
    "citoglipton",                # Almost all 0s
    "metformin-rosiglitazone",    # Deprecated, rare
    "metformin-pioglitazone"      # Deprecated, rare
]

# Add any overlapping columns from ids_mapping (if applicable)
drop_cols += [col for col in ids_mapping.columns if col in df.columns]

# Drop specified columns safely (names that are not present are ignored)
df = df.drop(columns=drop_cols, errors='ignore')

# Drop columns with excessive missing values (>=50%) or high cardinality objects
df = df.loc[:, df.isnull().mean() < 0.5]
df = df.drop(columns=[
    col for col in df.select_dtypes(include="object")
    if df[col].nunique() > 50
])

# Drop remaining rows with any missing values
df = df.dropna()

# One-hot encode categorical features (first level of each is dropped as baseline)
df_dummies = pd.get_dummies(df, drop_first=True, dtype=float)

# Separate features and target
X = df_dummies.drop("target", axis=1)
y = df_dummies["target"]

# Convert to sparse matrix
X_sparse = sparse.csr_matrix(X.values)

# Train/test split.
# stratify=y keeps the proportion of positive (target == 1) rows the same in
# both partitions, so the rare class is not over- or under-represented in the
# test set by chance.
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(
    X_sparse, y, test_size=0.3, random_state=42, stratify=y
)

# Scale features without centering (sparse-compatible).
# The scaler is fitted on the training partition only and then applied to the
# test partition, so no test-set statistics leak into training.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# Prepare metrics dictionary (filled in by the model cells that follow)
metrics = {}

In [None]:
# t-SNE is used because it turns the multi-dimensional feature space into a
# 2-D space that can be plotted. The picture gives an initial guess at which
# model is likely to work best; that initial guess can then be compared with
# the final result of the model comparison.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
# The scaled training matrix is sparse; it is densified before being embedded.
X_embedded = tsne.fit_transform(X_train_scaled.toarray())

# Plot
plt.figure(figsize=(10, 6))
# 0 (NOT readmitted) = blue || 1 (readmitted) = red
scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y_train, cmap="coolwarm", alpha=0.6)
# plt.title requires a label; calling it with no argument raises TypeError.
plt.title("t-SNE projection of training features")
plt.xlabel("x vals")
plt.ylabel("y vals")
plt.colorbar(scatter, label="Readmitted (<30 days = 1)")
plt.grid(True)
plt.show()

k-Nearest Neighbors

In [None]:
# Fit a 5-nearest-neighbours classifier and record its test-set metrics.
knn = KNeighborsClassifier(n_neighbors=5)

# Convert sparse matrices to dense arrays
X_train_dense = X_train_scaled.toarray()
X_test_dense = X_test_scaled.toarray()

knn.fit(X_train_dense, y_train)
y_pred_knn = knn.predict(X_test_dense)
# Probability of the positive class (second column, label 1). ROC AUC is a
# ranking metric and needs continuous scores; feeding it hard 0/1 predictions
# collapses the curve to a single operating point.
y_score_knn = knn.predict_proba(X_test_dense)[:, 1]

metrics["kNN"] = {
    "Accuracy": accuracy_score(y_test, y_pred_knn),
    "Precision": precision_score(y_test, y_pred_knn),
    "Recall": recall_score(y_test, y_pred_knn),
    "F1": f1_score(y_test, y_pred_knn),
    "AUC": roc_auc_score(y_test, y_score_knn)
}

for metric_name, value in metrics["kNN"].items():
    print(f"{metric_name}: {value:.4f}")

Logistic Regression

In [None]:
# Fit logistic regression on the scaled sparse features and record its
# test-set metrics.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
# Probability of the positive class (second column, label 1), used as the
# ranking score for ROC AUC.
y_score_lr = lr.predict_proba(X_test_scaled)[:, 1]

metrics["Logistic Regression"] = {
    "Accuracy": accuracy_score(y_test, y_pred_lr),
    "Precision": precision_score(y_test, y_pred_lr),
    "Recall": recall_score(y_test, y_pred_lr),
    "F1": f1_score(y_test, y_pred_lr),
    # AUC must come from this model's own scores, not from another model's
    # predictions.
    "AUC": roc_auc_score(y_test, y_score_lr)
}

for metric_name, value in metrics["Logistic Regression"].items():
    print(f"{metric_name}: {value:.4f}")

Feedforward Neural Network

In [None]:
# Fit a feedforward network with two hidden layers (64 and 32 units) and
# record its test-set metrics.
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred_mlp = mlp.predict(X_test_scaled)
# Probability of the positive class (second column, label 1), used as the
# ranking score for ROC AUC.
y_score_mlp = mlp.predict_proba(X_test_scaled)[:, 1]

metrics["Feedforward NN"] = {
    "Accuracy": accuracy_score(y_test, y_pred_mlp),
    "Precision": precision_score(y_test, y_pred_mlp),
    "Recall": recall_score(y_test, y_pred_mlp),
    "F1": f1_score(y_test, y_pred_mlp),
    # AUC must come from this model's own scores, not from another model's
    # predictions.
    "AUC": roc_auc_score(y_test, y_score_mlp)
}
for metric_name, value in metrics["Feedforward NN"].items():
    print(f"{metric_name}: {value:.4f}")

Decision Tree

In [None]:
# Fit a decision tree with default settings and record its test-set metrics.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)
# Probability of the positive class (second column, label 1), used as the
# ranking score for ROC AUC.
y_score_dt = dt.predict_proba(X_test_scaled)[:, 1]

metrics["Decision Tree"] = {
    "Accuracy": accuracy_score(y_test, y_pred_dt),
    "Precision": precision_score(y_test, y_pred_dt),
    "Recall": recall_score(y_test, y_pred_dt),
    "F1": f1_score(y_test, y_pred_dt),
    # AUC must come from this model's own scores, not from another model's
    # predictions.
    "AUC": roc_auc_score(y_test, y_score_dt)
}
for metric_name, value in metrics["Decision Tree"].items():
    print(f"{metric_name}: {value:.4f}")

Confusion Matrices

In [None]:
def plot_confusion(title, y_true, y_pred):
    """Show a confusion-matrix heatmap for one set of predictions.

    Parameters
    ----------
    title : str
        Text placed above the heatmap.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    The counts are annotated in each cell as integers, and the figure is
    displayed immediately; nothing is returned.
    """
    # heatmap draws on the current axes and hands them back, so the labels
    # below are applied to that same plot.
    heat_ax = sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        cmap="Blues",
    )
    heat_ax.set_title(title)
    heat_ax.set_xlabel("Predicted")
    heat_ax.set_ylabel("Actual")
    plt.show()

# Draw one confusion matrix per classifier, in the order the models were fitted.
for model_label, test_predictions in (
    ("kNN", y_pred_knn),
    ("Logistic Regression", y_pred_lr),
    ("Feedforward NN", y_pred_mlp),
    ("Decision Tree", y_pred_dt),
):
    plot_confusion(f"{model_label} Confusion Matrix", y_test, test_predictions)

Visualization of Metrics

In [None]:
def plot_metrics(metrics_dict):
    """Draw a grouped bar chart of evaluation metrics, one group per classifier.

    Parameters
    ----------
    metrics_dict : dict
        Maps a classifier name to a dict holding the keys "Accuracy",
        "Precision", "Recall", "F1" and "AUC". A classifier entry missing
        one of these keys raises ``KeyError``.

    The figure is displayed immediately; nothing is returned.
    """
    metrics_names = ["Accuracy", "Precision", "Recall", "F1", "AUC"]
    classifiers = list(metrics_dict.keys())
    values = {metric: [metrics_dict[classifier][metric] for classifier in classifiers] for metric in metrics_names}

    x = np.arange(len(classifiers))
    # Classifier groups are one unit apart. Giving the bars 80% of that unit
    # leaves a visible gap between neighbouring groups; a fixed width of 0.2
    # with five metrics would fill the whole unit and make the groups touch.
    n_metrics = len(metrics_names)
    width = 0.8 / n_metrics

    plt.figure(figsize=(10,6))
    for i, metric in enumerate(metrics_names):
        plt.bar(x + i*width, values[metric], width=width, label=metric)

    # Bar centres in a group run from x to x + (n_metrics - 1) * width, so the
    # midpoint of that range puts the tick label under the centre of the group.
    plt.xticks(x + width*(n_metrics - 1)/2, classifiers)
    plt.ylabel("Score")
    plt.title("Metrics by Classifier")
    plt.ylim(0, 1)
    plt.legend()
    plt.tight_layout()
    plt.show()

plot_metrics(metrics)