In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, jaccard_score, hamming_loss, precision_recall_curve


df = pd.read_csv("/home/chanbo.s/personalized_ecoli/new_code/merged_microbiology_admissions_final.csv.csv")

bins = [0, 20, 40, 60, 80, 100]
labels = ['0-20', '21-40', '41-60', '61-80', '80+']
df['age_group'] = pd.cut(df['anchor_age'], bins=bins, labels=labels, include_lowest=True)


df['effective_antibiotics'] = df['effective_antibiotics'].apply(
    lambda x: [ant.strip() for ant in x.strip("[]").replace("'", "").split(",") if ant.strip()]
)


encoder = OneHotEncoder(sparse_output=False, drop='first')
X = encoder.fit_transform(df[['gender', 'age_group']])

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['effective_antibiotics'])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

base_classifiers = [
    ('rf', RandomForestClassifier(n_estimators=300, max_depth=30, class_weight='balanced', random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=300, max_depth=30, class_weight='balanced', random_state=42))
]

stacking_model = StackingClassifier(
    estimators=base_classifiers,
    final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
    passthrough=True,
    cv=5
)

model = MultiOutputClassifier(stacking_model)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='samples')
hamming = hamming_loss(y_test, y_pred)

print(f"\nBase Evaluation:")
print(f"F1 Score (macro): {f1:.3f}")
print(f"Jaccard Similarity: {jaccard:.3f}")
print(f"Hamming Loss: {hamming:.3f}")


y_pred_proba = np.array([clf.predict_proba(X_test)[:, 1] for clf in model.estimators_]).T
optimal_thresholds = []

for i in range(y.shape[1]):
    precision, recall, thresholds = precision_recall_curve(y_test[:, i], y_pred_proba[:, i])
    f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
    best_thresh = thresholds[np.argmax(f1_scores)] if len(thresholds) > 0 else 0.5
    optimal_thresholds.append(best_thresh)


y_pred_opt = (y_pred_proba >= optimal_thresholds).astype(int)


f1_opt = f1_score(y_test, y_pred_opt, average='macro')
jaccard_opt = jaccard_score(y_test, y_pred_opt, average='samples')
hamming_opt = hamming_loss(y_test, y_pred_opt)

print(f"\nAfter Threshold Tuning:")
print(f"F1 Score (macro): {f1_opt:.3f}")
print(f"Jaccard Similarity: {jaccard_opt:.3f}")
print(f"Hamming Loss: {hamming_opt:.3f}")


def predict_antibiotics(gender, age):
    age_group = pd.cut([age], bins=bins, labels=labels, include_lowest=True)[0]
    input_df = pd.DataFrame({'gender': [gender], 'age_group': [age_group]})
    input_encoded = encoder.transform(input_df)

 
    probs = np.array([clf.predict_proba(input_encoded)[:, 1] for clf in model.estimators_]).T


    binary_preds = (probs >= optimal_thresholds).astype(int)
    recommendations = mlb.inverse_transform(binary_preds)
    return recommendations[0]


gender, age = 'F', 30
recommended = predict_antibiotics(gender, age)
print(f"\nRecommended Antibiotics for {gender}, Age {age}: {recommended}")




Base Evaluation:
F1 Score (macro): 0.491
Jaccard Similarity: 0.513
Hamming Loss: 0.338





After Threshold Tuning:
F1 Score (macro): 0.568
Jaccard Similarity: 0.646
Hamming Loss: 0.281

Recommended Antibiotics for F, Age 30: ('AMPICILLIN', 'AMPICILLIN/SULBACTAM', 'CEFAZOLIN', 'CEFEPIME', 'CEFTAZIDIME', 'CEFTAZIDIME/AVIBACTAM', 'CEFTRIAXONE', 'CEFUROXIME', 'CIPROFLOXACIN', 'GENTAMICIN', 'IMIPENEM', 'LEVOFLOXACIN', 'MEROPENEM', 'NITROFURANTOIN', 'PIPERACILLIN', 'PIPERACILLIN/TAZO', 'TOBRAMYCIN', 'TRIMETHOPRIM/SULFA')
