In [1]:
import optuna
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import shap
from sklearn .metrics import roc_auc_score

In [2]:
data = pd.read_csv('encoded_training.csv')
data

Unnamed: 0,ID,PATID,UCX_abnormal,ua_bacteria,ua_bili,ua_blood,ua_clarity,ua_color,ua_epi,ua_glucose,...,ua_wbc,age,Urinary_tract_infections,abxUTI,ethnicity_Hispanic or Latino,ethnicity_Non-Hispanic,ethnicity_Patient Refused,ethnicity_Unknown,ethnicity_not_reported,Female
0,34660,22882,1,4,0,0,1,1,1,0,...,2,53,0,1,False,True,False,False,False,True
1,21796,14379,0,1,1,0,0,1,1,0,...,1,30,0,1,False,True,False,False,False,True
2,73901,50935,1,2,0,3,1,1,-1,0,...,3,52,0,1,False,True,False,False,False,True
3,31039,20477,0,2,0,1,-1,1,1,0,...,2,43,0,1,False,True,False,False,False,False
4,52301,35273,1,4,0,0,1,1,3,0,...,2,49,0,1,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13271,72561,50001,0,4,1,0,1,1,3,0,...,2,54,0,0,False,True,False,False,False,True
13272,9820,6476,0,-1,0,0,0,1,-1,0,...,-1,52,0,0,False,True,False,False,False,False
13273,49819,33460,1,2,0,2,1,1,1,0,...,2,42,0,1,False,True,False,False,False,True
13274,17174,11391,1,1,0,0,0,0,1,0,...,2,33,0,1,False,True,False,False,False,True


In [3]:
# Define features and target
X = data.drop(columns=['UCX_abnormal', 'ID', 'PATID'])  # Dropping ID columns and target
y = data['UCX_abnormal']

In [4]:
# Optional: Standardize features if necessary
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Define the Optuna objective function for SVM
def objective(trial):
    # Suggest hyperparameters for the SVM model
    C = trial.suggest_float('C', 1e-3, 1e2)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    
    # Additional kernel-specific parameters
    if kernel in ['poly', 'rbf', 'sigmoid']:
        gamma = trial.suggest_float('gamma', 1e-4, 1e1)
    else:
        gamma = 'scale'  # Use default for 'linear'
    
    # Define the SVM model
    model = SVC(C=C, kernel=kernel, gamma=gamma)
    
    # Perform cross-validation
    score = cross_val_score(model, X_scaled, y, cv=3, scoring='accuracy')
    
    # Return the negative mean score (Optuna minimizes)
    return -score.mean()


In [6]:
# Create a study object for SVM
study = optuna.create_study(direction='minimize') 

[I 2024-10-01 14:49:18,469] A new study created in memory with name: no-name-cfe244c4-f105-46d3-9c20-b1e7cf32ea7c


In [7]:
# Optimize the study
study.optimize(objective, n_trials=30)  

[I 2024-10-01 14:53:12,273] Trial 0 finished with value: -0.76732460048183 and parameters: {'C': 26.690588518620196, 'kernel': 'linear'}. Best is trial 0 with value: -0.76732460048183.
[I 2024-10-01 14:53:52,409] Trial 1 finished with value: -0.6627748886693338 and parameters: {'C': 60.324945850586865, 'kernel': 'rbf', 'gamma': 1.0095435760950284}. Best is trial 0 with value: -0.76732460048183.
[I 2024-10-01 14:54:02,344] Trial 2 finished with value: -0.6271462841980661 and parameters: {'C': 96.39562146384851, 'kernel': 'sigmoid', 'gamma': 8.590553120175333}. Best is trial 0 with value: -0.76732460048183.
[I 2024-10-01 14:55:02,413] Trial 3 finished with value: -0.5826300673217583 and parameters: {'C': 62.98927495036479, 'kernel': 'rbf', 'gamma': 7.297936657836365}. Best is trial 0 with value: -0.76732460048183.
[I 2024-10-01 14:55:12,199] Trial 4 finished with value: -0.6263930055493008 and parameters: {'C': 42.69068524077496, 'kernel': 'sigmoid', 'gamma': 3.781947151456459}. Best is 

In [8]:
# Get the best parameters
print(f"Best parameters: {study.best_params}")
print(f"Best score: {-study.best_value}")

Best parameters: {'C': 6.250114313793128, 'kernel': 'sigmoid', 'gamma': 0.001666639637443693}
Best score: 0.766194520820728


In [9]:
# Train the final model using the best parameters
best_params = study.best_params
final_model = SVC(**best_params)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the model on the training set
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [10]:
# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.7624
Precision: 0.7959
Recall: 0.7321
F1 Score: 0.7627
Confusion Matrix:
[[1011  260]
 [ 371 1014]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.80      0.76      1271
           1       0.80      0.73      0.76      1385

    accuracy                           0.76      2656
   macro avg       0.76      0.76      0.76      2656
weighted avg       0.77      0.76      0.76      2656

