In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, jaccard_score, hamming_loss
import warnings
warnings.filterwarnings('ignore')


def _parse_antibiotic_list(raw):
    """Turn a stringified list such as "['A', 'B']" into ['A', 'B'].

    The surrounding brackets and all single quotes are removed, the rest is
    split on commas, and empty entries are discarded.
    """
    stripped = (part.strip() for part in raw.strip("[]").replace("'", "").split(","))
    return [name for name in stripped if name]


# Load the merged microbiology/admissions table.
df = pd.read_csv("/home/chanbo.s/personalized_ecoli/new_code/merged_microbiology_admissions_final.csv.csv")

# Age bands are right-closed intervals: (0, 20], (20, 40], ... (80, 100].
# Note that the '80+' label only covers ages up to 100; values outside
# (0, 100] are mapped to NaN by pd.cut.
bins = [0, 20, 40, 60, 80, 100]
labels = ['0-20', '21-40', '41-60', '61-80', '80+']
df['age_group'] = pd.cut(df['anchor_age'], bins=bins, labels=labels)

# Take an explicit copy so the column assignment below writes to an
# independent frame instead of a possible view of the original (avoids
# pandas' SettingWithCopyWarning and its ambiguous semantics).
df = df[['gender', 'age_group', 'effective_antibiotics']].copy()
df['effective_antibiotics'] = df['effective_antibiotics'].apply(_parse_antibiotic_list)


# Targets: one binary indicator column per distinct antibiotic name.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['effective_antibiotics'])

# Features: one-hot encode gender and age group, dropping the first level of
# each so the remaining columns are relative to a reference category.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit(df[['gender', 'age_group']]).transform(df[['gender', 'age_group']])
X = pd.DataFrame(
    data=encoded_features,
    columns=encoder.get_feature_names_out(['gender', 'age_group']),
)

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


# One random forest is fitted per antibiotic label via MultiOutputClassifier.
base_forest = RandomForestClassifier(random_state=42)
rf = MultiOutputClassifier(base_forest)

# Hyper-parameters of the wrapped forest are addressed with the
# 'estimator__' prefix; the grid below yields 2 * 3 * 2 * 2 = 24 candidates.
param_grid = {
    'estimator__n_estimators': [100, 200],
    'estimator__max_depth': [10, 20, None],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2],
}

# 3-fold cross-validated search, ranked by macro-averaged F1, on all cores.
clf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train)

# Evaluate the best configuration found by the grid search on the held-out set.
best_model = clf.best_estimator_
y_pred = best_model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')          # mean of per-label F1
jaccard = jaccard_score(y_test, y_pred, average='samples')  # mean per-row set overlap
hamming = hamming_loss(y_test, y_pred)                  # fraction of wrong label bits

for metric_name, metric_value in (
    ("F1 Score (macro)", f1),
    ("Jaccard Similarity", jaccard),
    ("Hamming Loss", hamming),
):
    print(f"{metric_name}: {metric_value:.2f}")


def predict_antibiotics(gender, age, threshold=0.4):
    """Predict recommended antibiotics based on gender and age.

    Uses the module-level fitted objects ``encoder``, ``best_model`` and
    ``mlb``, together with the ``bins``/``labels`` that defined the training
    ``age_group`` column, so the input is prepared the same way as the
    training data.

    Args:
        gender: Gender value as it appears in the training data (e.g. 'F').
        age: Age in years. It is binned with the training ``bins``; an age
            outside those bins becomes NaN, which the encoder will only
            accept if it also saw NaN during fitting.
        threshold: Minimum predicted probability of the positive class for
            an antibiotic to be included in the recommendation.

    Returns:
        The result of ``mlb.inverse_transform`` for the single input row: a
        one-element list holding a tuple of recommended antibiotic names.
    """
    # Bin the age with the same edges/labels used for the training column.
    age_group = pd.cut([age], bins=bins, labels=labels)[0]

    input_df = pd.DataFrame({'gender': [gender], 'age_group': [age_group]})
    # The model was fitted on a DataFrame, so pass the encoded row with the
    # same column names rather than a bare array.
    input_encoded = pd.DataFrame(
        encoder.transform(input_df),
        columns=encoder.get_feature_names_out(['gender', 'age_group']),
    )

    # One probability array per label, columns ordered as estimator.classes_.
    y_pred_proba = best_model.predict_proba(input_encoded)

    flags = []
    for estimator, proba in zip(best_model.estimators_, y_pred_proba):
        # A label that was constant in training yields a single-column
        # probability array; look the positive class up explicitly so an
        # "always 1" label is recommended and an "always 0" label is not.
        positive_idx = np.flatnonzero(estimator.classes_ == 1)
        positive_proba = proba[0, positive_idx[0]] if positive_idx.size else 0.0
        flags.append(positive_proba >= threshold)

    # Single-row 0/1 indicator matrix, as expected by inverse_transform.
    y_pred = np.array(flags, dtype=int).reshape(1, -1)

    recommended_antibiotics = mlb.inverse_transform(y_pred)

    return recommended_antibiotics

# Example query: recommendation for a 30-year-old female patient.
gender = 'F'
age = 30
recommended = predict_antibiotics(gender, age)
print(f"Recommended Antibiotics for {gender}, Age {age}: {recommended}")


Fitting 3 folds for each of 24 candidates, totalling 72 fits
F1 Score (macro): 0.54
Jaccard Similarity: 0.83
Hamming Loss: 0.10
Recommended Antibiotics for F, Age 30: [('AMPICILLIN', 'AMPICILLIN/SULBACTAM', 'CEFAZOLIN', 'CEFEPIME', 'CEFTAZIDIME', 'CEFTRIAXONE', 'CIPROFLOXACIN', 'GENTAMICIN', 'MEROPENEM', 'NITROFURANTOIN', 'PIPERACILLIN/TAZO', 'TOBRAMYCIN', 'TRIMETHOPRIM/SULFA')]
