In [4]:
import numpy as np
import optuna

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score


In [2]:
# Load the breast-cancer dataset that ships with scikit-learn.
cancer = datasets.load_breast_cancer()

# Features are used as-is; the 0/1 labels are inverted (presumably so that the
# class originally encoded as 0 becomes the positive class -- confirm against
# cancer.target_names).
X, y = cancer.data, 1 - cancer.target

# Keep 20% of the samples aside as a test split; the fixed seed makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    shuffle=True,
)


In [3]:
# Baseline: logistic regression with hand-picked settings, fit on the training split.
model = LogisticRegression(max_iter=10000, solver="lbfgs").fit(X_train, y_train)

# Predict the held-out split and report accuracy as a percentage.
pred = model.predict(X_test)
score = accuracy_score(y_test, pred) * 100
print("accuracy score: {}".format(score))


accuracy score: 98.24561403508771


In [5]:
# Five-fold cross-validation of the baseline estimator, using the training
# split only; the mean fold score is printed as a percentage.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print("fifth devided cross validation: {}".format(scores.mean() * 100))


fifth devided cross validation: 94.94505494505493


In [9]:
class Objective:
    """Optuna objective: mean cross-validated accuracy of a logistic regression.

    An instance stores the data to evaluate on and is itself the callable
    handed to ``study.optimize``.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray) -> None:
        """Keep the feature matrix ``X`` and label vector ``y`` used by every trial."""
        self.X = X
        self.y = y

    def __call__(self, trial: optuna.Trial) -> float:
        """Sample one hyper-parameter set from ``trial`` and score it.

        Args:
            trial: The Optuna trial used to draw ``solver``, ``C`` and
                ``max_iter``.

        Returns:
            The mean of the per-fold test accuracies returned by
            ``cross_validate`` (a fraction in [0, 1], not a percentage).
        """
        params = {
            "solver": trial.suggest_categorical(
                "solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
            ),
            # The range covers five orders of magnitude. Sampling it uniformly
            # would put ~90% of the draws in [1, 10] and almost never try
            # strong regularisation, so draw C on a log scale instead.
            "C": trial.suggest_float("C", 1.0e-4, 10, log=True),
            "max_iter": trial.suggest_int("max_iter", 100, 100000),
        }
        model = LogisticRegression(**params)
        # No ``cv`` is passed, so cross_validate uses its default splitting.
        scores = cross_validate(model, X=self.X, y=self.y, scoring="accuracy", n_jobs=-1)

        # Convert the NumPy scalar to a plain float to match the annotation.
        return float(scores["test_score"].mean())

# Search for the hyper-parameters that maximise the objective (mean CV
# accuracy on the training split), with a 60-second time budget.
study = optuna.create_study(direction="maximize")
objective = Objective(X=X_train, y=y_train)
study.optimize(func=objective, timeout=60)

print("params: ", study.best_params)


[I 2023-11-12 23:50:30,338] A new study created in memory with name: no-name-29ea2381-5d5e-4119-91b6-9659539e5260
[I 2023-11-12 23:50:30,419] Trial 0 finished with value: 0.9494505494505494 and parameters: {'solver': 'newton-cg', 'C': 0.8479898214489653, 'max_iter': 35731}. Best is trial 0 with value: 0.9494505494505494.
[I 2023-11-12 23:50:30,444] Trial 1 finished with value: 0.9362637362637362 and parameters: {'solver': 'liblinear', 'C': 0.6743340311924438, 'max_iter': 65545}. Best is trial 0 with value: 0.9494505494505494.
[I 2023-11-12 23:50:31,108] Trial 2 finished with value: 0.9164835164835164 and parameters: {'solver': 'sag', 'C': 5.39664726564676, 'max_iter': 55331}. Best is trial 0 with value: 0.9494505494505494.
[I 2023-11-12 23:50:31,478] Trial 3 finished with value: 0.9494505494505494 and parameters: {'solver': 'lbfgs', 'C': 1.6685302813378748, 'max_iter': 39909}. Best is trial 0 with value: 0.9494505494505494.
[I 2023-11-12 23:50:32,163] Trial 4 finished with value: 0.916

params:  {'solver': 'lbfgs', 'C': 9.10999014351516, 'max_iter': 8103}


In [10]:
# Best hyper-parameters found by the study and the objective value they reached.
(study.best_params, study.best_value)


({'solver': 'lbfgs', 'C': 9.10999014351516, 'max_iter': 8103},
 0.9582417582417584)

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score

# Refit on the training split with the tuned settings. The study's best
# parameters are exactly the keyword arguments the objective sampled
# (solver, C, max_iter), so they are unpacked straight into the estimator.
model = LogisticRegression(**study.best_params).fit(X_train, y_train)
pred = model.predict(X_test)

# Held-out metrics as percentages, with the raw confusion matrix in between.
print("Accuracy: {:.5f}".format(accuracy_score(y_test, pred) * 100))

print(confusion_matrix(y_test, pred))

print("Precision: {:.5f}".format(precision_score(y_test, pred) * 100))
print("Recall: {:.5f}".format(recall_score(y_test, pred) * 100))


Accuracy: 98.24561
[[73  0]
 [ 2 39]]
Precision: 100.00000
Recall: 95.12195
