In [58]:
import pandas as pd
import numpy as np
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn .metrics import roc_auc_score

In [59]:
data = pd.read_csv('encoded_training.csv')
data

Unnamed: 0,ID,PATID,UCX_abnormal,ua_bacteria,ua_bili,ua_blood,ua_clarity,ua_color,ua_epi,ua_glucose,...,ua_wbc,age,Urinary_tract_infections,abxUTI,ethnicity_Hispanic or Latino,ethnicity_Non-Hispanic,ethnicity_Patient Refused,ethnicity_Unknown,ethnicity_negative,Female
0,13977,9243,0,1,0,3,0,1,1,1,...,1,53,0,1,False,True,False,False,False,True
1,884,576,0,0,0,0,0,1,0,0,...,0,50,0,0,False,True,False,False,False,False
2,39389,26105,0,0,0,0,0,1,0,0,...,0,26,0,0,True,False,False,False,False,False
3,46117,30879,0,1,0,2,1,4,0,0,...,1,43,0,0,True,False,False,False,False,False
4,47879,32042,0,2,0,0,1,1,3,0,...,1,24,0,1,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18491,12452,8214,0,0,0,2,1,1,1,2,...,1,51,0,1,False,True,False,False,False,True
18492,69084,47703,1,2,0,3,1,4,1,0,...,3,47,0,1,False,True,False,False,False,True
18493,17545,11612,1,2,0,0,1,1,1,1,...,2,58,0,1,False,True,False,False,False,True
18494,14717,9744,0,0,0,0,0,0,0,0,...,0,44,0,0,False,True,False,False,False,True


In [60]:
# Define features and target
X = data.drop(columns=['UCX_abnormal', 'ID', 'PATID'])  # Dropping ID columns and target
y = data['UCX_abnormal']

In [61]:
# Optional: Standardize features if necessary
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [62]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [63]:
# Define the Optuna objective function for logistic regression
def objective(trial):
    # Suggest the regularization strength 'C' (inverse of regularization strength)
    C = trial.suggest_float('C', 1e-5, 1e2)

    tol = trial.suggest_loguniform('tol', 1e-6, 1e-2)
    
    # Define the logistic regression model
    model = LogisticRegression(C=C, max_iter=1000, solver='liblinear', tol=tol)
    
    # Perform cross-validation
    score = cross_val_score(model, X_scaled, y, cv=3, scoring='roc_auc')
    
    # Return the negative mean score (Optuna minimizes)
    return -score.mean()

In [64]:
# Create a study object for logistic regression
study = optuna.create_study(direction='maximize')  # Minimize the negative accuracy

[I 2024-10-05 17:55:44,873] A new study created in memory with name: no-name-398ab82b-99bd-44cc-bf96-077e949eb107


In [65]:
# Optimize the study
study.optimize(objective, n_trials=100)  # Run the optimization for 50 trials

  tol = trial.suggest_loguniform('tol', 1e-6, 1e-2)
[I 2024-10-05 17:55:45,090] Trial 0 finished with value: -0.8655174780796373 and parameters: {'C': 15.235892956167163, 'tol': 0.00033781866635427957}. Best is trial 0 with value: -0.8655174780796373.
  tol = trial.suggest_loguniform('tol', 1e-6, 1e-2)
[I 2024-10-05 17:55:45,287] Trial 1 finished with value: -0.8655180950130194 and parameters: {'C': 11.359187923251204, 'tol': 3.1514164091912736e-05}. Best is trial 0 with value: -0.8655174780796373.
  tol = trial.suggest_loguniform('tol', 1e-6, 1e-2)
[I 2024-10-05 17:55:45,476] Trial 2 finished with value: -0.8655173833055793 and parameters: {'C': 73.47922956376247, 'tol': 0.00016574218959840062}. Best is trial 2 with value: -0.8655173833055793.
  tol = trial.suggest_loguniform('tol', 1e-6, 1e-2)
[I 2024-10-05 17:55:45,699] Trial 3 finished with value: -0.8655188536670924 and parameters: {'C': 51.15992423786562, 'tol': 5.8495579914184475e-06}. Best is trial 2 with value: -0.865517383305

In [66]:
# Get the best parameters
print(f"Best parameters: {study.best_params}")
print(f"Best score: {-study.best_value}")

Best parameters: {'C': 18.669814060809298, 'tol': 0.001913665264931222}
Best score: 0.8655149682677749


In [67]:
# Train the final model using the best parameters
best_params = study.best_params
final_model = LogisticRegression(**best_params, max_iter=1000, solver='liblinear')

In [68]:

# Train the model on the training set
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
auc = np.round(roc_auc_score(y_test, y_pred), 3)


In [69]:
# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print("AUC for the data is {}".format(auc))

Accuracy: 0.8549
Precision: 0.7621
Recall: 0.5696
F1 Score: 0.6520
Confusion Matrix:
[[2660  157]
 [ 380  503]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      2817
           1       0.76      0.57      0.65       883

    accuracy                           0.85      3700
   macro avg       0.82      0.76      0.78      3700
weighted avg       0.85      0.85      0.85      3700

AUC for the data is 0.757
