In [None]:
import pandas as pd
import numpy as np

import os

# Location of the silver-layer dataset. The default is the original
# workstation path, so existing runs behave exactly as before; set the
# CAUTI_SILVER_CSV environment variable to point the notebook at another
# copy of the file without editing this cell.
SILVER_CSV_PATH = os.environ.get(
    "CAUTI_SILVER_CSV",
    r"C:\Users\Coditas\Desktop\Projects\Cauti\silver\silver_dataset.csv",
)

# Raw frame: still contains the target column "y" and every column that is
# excluded from the features further down.
df = pd.read_csv(SILVER_CSV_PATH)
# Columns removed from the feature matrix, grouped by the reason they are
# excluded. Group order and in-group order define the order of
# `leakage_columns`.
leakage_groups = {
    "direct infection evidence": [
        "urinalysis_wbc",
    ],
    "clinical response to infection": [
        "antibiotics_per_admission",
        "recent_antibiotic_use",
    ],
    "explicit UTI / CAUTI knowledge": [
        "other_uti_present",
        "other_uti_uti_unspecified",
        "other_uti_cystitis",
        "other_uti_pyelonephritis",
        "other_uti_urethritis",
        "recurrent_uti",
    ],
    "identifiers and catheter flag": [
        "subject_id",
        "hadm_id",
        "catheter_present",
    ],
}

# Candidates that are currently NOT excluded by this list (disabled entries):
#   direct infection evidence : urinalysis_rbc, urine_culture_performed,
#                               cfu_count_measured,
#                               gram_negative_organisms_present,
#                               gram_positive_organisms_present, fungi_present,
#                               blood_culture_performed, nitrite_tested,
#                               nitrite_positive
#   lab & biomarker responses : blood_wbc, creatinine, procalcitonin_measured,
#                               blood_crp_measured
#   vital signs               : temperature, heart_rate, resp_rate
#   outcome proxy             : has_cauti_history
#   renal output              : oliguria
leakage_columns = [
    column
    for group_columns in leakage_groups.values()
    for column in group_columns
]
# Target vector: the "y" column cast to integer labels.
y = df["y"].astype(int)

# Feature matrix: every column except the target and the excluded columns.
# `drop` returns a new frame, so `df` itself is left untouched.
columns_to_exclude = ["y", *leakage_columns]
X = df.drop(columns=columns_to_exclude)

In [None]:
# Preview the first five rows of the raw frame (target and excluded columns
# are still present here; only X has them removed).
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
split_settings = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_settings)

In [None]:
# Replace any remaining missing values with 0 in both partitions.
# NOTE(review): the original note calls this safe because missingness flag
# columns exist — confirm those flags are still among the features.
X_train, X_val = (partition.fillna(0) for partition in (X_train, X_val))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the standardisation statistics from the training partition only,
# then apply that same fitted transformation to both partitions so no
# validation information reaches the scaler.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled = scaler.transform(X_train), scaler.transform(X_val)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Feed-forward binary classifier: two ReLU hidden layers (128 and 64 units),
# each followed by 30% dropout, and a single sigmoid unit that outputs a
# probability.
n_features = X_train_scaled.shape[1]

model = Sequential([
    # Declare the input with an explicit Input object instead of passing
    # `input_shape=` to the first Dense layer, which newer Keras releases
    # warn against.
    tf.keras.Input(shape=(n_features,)),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    # Name the AUC metric explicitly. With the string shortcut "AUC" the key
    # written to the training logs depends on the Keras version (and can gain
    # a numeric suffix when this cell is re-run), so a callback monitoring
    # "val_AUC" may never find its metric. A fixed name guarantees the logs
    # contain exactly "AUC" and "val_AUC".
    metrics=[tf.keras.metrics.AUC(name="AUC"), "accuracy"],
)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights are inversely proportional to class frequency in the
# training labels, so the rarer class contributes more to the loss.
binary_labels = np.array([0, 1])
class_weights = compute_class_weight(
    class_weight="balanced", classes=binary_labels, y=y_train
)

# Keras expects a {class_label: weight} mapping.
class_weight_dict = {label: class_weights[label] for label in (0, 1)}


In [None]:
# Stop once validation AUC has gone 5 epochs without improving, and roll the
# model back to the weights from its best epoch.
early_stop = EarlyStopping(
    monitor="val_AUC", mode="max", patience=5, restore_best_weights=True
)

# Training configuration: up to 50 epochs in batches of 256, evaluated on the
# validation partition after each epoch, with class-weighted loss.
fit_options = dict(
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=256,
    callbacks=[early_stop],
    class_weight=class_weight_dict,
    verbose=1,
)
history = model.fit(X_train_scaled, y_train, **fit_options)


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Model outputs for the validation rows, flattened to a 1-D array of
# predicted probabilities.
val_predictions = model.predict(X_val_scaled)
y_val_prob = val_predictions.ravel()

In [None]:
from sklearn.metrics import precision_recall_curve, classification_report
import numpy as np

# Precision and recall at every candidate cut-off on the validation scores.
precision, recall, thresholds = precision_recall_curve(y_val, y_val_prob)

# Take the last curve position whose recall still reaches the target.
# precision_recall_curve reports thresholds in increasing order, so this is
# the largest cut-off that keeps recall at or above the target.
target_recall = 0.80
meets_target = recall >= target_recall
idx = np.where(meets_target)[0][-1]

best_threshold = thresholds[idx]
print("Best threshold (recall-based):", best_threshold)

# Turn probabilities into hard labels with the selected cut-off.
# NOTE(review): the threshold is chosen and scored on the same validation
# data, so this report is likely optimistic.
y_val_pred_best = (y_val_prob >= best_threshold).astype(int)

val_report = classification_report(y_val, y_val_pred_best)
print(val_report)

In [None]:
from sklearn.metrics import roc_auc_score
# Threshold-independent ranking quality on the validation partition.
val_roc_auc = roc_auc_score(y_val, y_val_prob)
print("ROC AUC:", val_roc_auc)

# ROC curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# ROC operating points and the area under the curve for the validation scores.
auc = roc_auc_score(y_val, y_val_prob)
fpr, tpr, _ = roc_curve(y_val, y_val_prob)
roc_label = f"ROC AUC = {auc:.3f}"

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], linestyle="--")  # chance-level diagonal for reference
plt.title("ROC Curve – CAUTI Classification")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.grid(True)
plt.legend()
plt.show()


# Precision Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision/recall trade-off and average precision for the validation scores.
# This rebinds `precision` and `recall` from the threshold-selection cell.
ap = average_precision_score(y_val, y_val_prob)
precision, recall, _ = precision_recall_curve(y_val, y_val_prob)
pr_label = f"AP = {ap:.3f}"

plt.figure(figsize=(6, 5))
plt.plot(recall, precision, label=pr_label)
plt.title("Precision–Recall Curve – CAUTI Classification")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Absolute correlation of every feature column with the target, strongest
# first. Computed on the full feature matrix X (before the train/val split).
abs_corr = X.corrwith(y).abs()
corr = abs_corr.sort_values(ascending=False)

# Keep only the features whose absolute correlation is at least 0.3; these
# are worth a second look as possible leakage.
high_corr = corr.loc[corr >= 0.3]

print(high_corr)
