In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting 

In [None]:
!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-0.1.10-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.6/156.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.10


In [None]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0


In [None]:
import numpy as np
import pandas as pd
import random

import sklearn
import sklearn.metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, precision_score, recall_score, f1_score, classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import calibration_curve
from scipy import stats as st
from random import randrange
from matplotlib import pyplot
from sklearn.utils import resample

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import torch
from tabpfn import TabPFNClassifier

import optuna
from optuna.samplers import TPESampler

In [None]:
# Read the dataset, using the first CSV column as the row index.
data = pd.read_csv('/data/MICE_data.csv', index_col=0)

# Split off the target column; every remaining column is a feature.
outcomes = ['Diagnosis']
y = data['Diagnosis']
x = data.drop(columns=outcomes)

In [None]:
# Sanity check: feature-matrix and target shapes (row counts must match).
(x.shape, y.shape)

((248, 8), (248,))

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
# Stratified 5-fold cross-validation repeated 5 times, with a fixed seed so
# every model and every trial is scored on the same splits.
skf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=42,
)

In [None]:
def objective(trial, model_type, x, y, skf):
    """Optuna objective: mean cross-validated ROC AUC for one model family.

    Samples a hyperparameter configuration from ``trial`` for the requested
    model family, fits the model on every training split produced by ``skf``
    and scores it on the matching validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    model_type : str
        One of ``'TabPFN'``, ``'lightgbm'``, ``'random_forest'``,
        ``'logistic_regression'`` or ``'neural_network'``.
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target; rows are selected positionally with ``.iloc``. The score is
        computed from column 1 of ``predict_proba``.
    skf : object
        Cross-validation splitter exposing ``split(x, y)`` that yields
        ``(train_idx, valid_idx)`` pairs.

    Returns
    -------
    float
        Mean ROC AUC over all validation splits.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported model families.
    """
    if model_type == 'TabPFN':
        params = {
            'N_ensemble_configurations': trial.suggest_int('N_ensemble_configurations', 1, 150),
        }
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = TabPFNClassifier(device=device, **params)

        np.random.seed(42)
        random.seed(42)

    elif model_type == 'lightgbm':
        # suggest_float(..., log=True) / suggest_float(...) replace the
        # deprecated suggest_loguniform / suggest_uniform.
        params = {
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'verbose': -1,
            'random_state': 42
        }
        model = lgb.LGBMClassifier(**params)

    elif model_type == 'random_forest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 5, 30),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
            # scikit-learn >= 1.3, so it is replaced by a distinct valid option.
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'random_state': 42
        }
        model = RandomForestClassifier(**params)

    elif model_type == 'logistic_regression':
        params = {
            'C': trial.suggest_float('C', 1e-4, 10.0),
            'solver': 'liblinear',
            'random_state': 42
        }
        model = LogisticRegression(**params)

    elif model_type == 'neural_network':
        # Candidate architectures: 30 one-layer, 400 two-layer and 3375
        # three-layer networks, with layer widths in steps of 10.
        hidden_layer_options = [
            (n,) for n in range(10, 301, 10)
        ] + [
            (n, m) for n in range(10, 201, 10) for m in range(10, 201, 10)
        ] + [
            (n, m, k) for n in range(10, 151, 10) for m in range(10, 151, 10) for k in range(10, 151, 10)
        ]

        params = {
            # NOTE(review): Optuna warns about non-primitive categorical
            # choices such as tuples; they are kept so that
            # study.best_params stays directly usable as MLPClassifier kwargs.
            'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', hidden_layer_options),
            'activation': trial.suggest_categorical('activation', ['tanh', 'relu']),
            'solver': trial.suggest_categorical('solver', ['sgd', 'adam']),
            'alpha': trial.suggest_float('alpha', 0.0001, 0.1, log=True),
            'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'adaptive']),
            'max_iter': trial.suggest_int('max_iter', 200, 2000),
            'random_state': 42
        }
        model = MLPClassifier(**params)

        np.random.seed(42)
        random.seed(42)

    else:
        # Fail early with a clear message instead of hitting an unbound
        # `model` further down.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of 'TabPFN', 'lightgbm', "
            "'random_forest', 'logistic_regression', 'neural_network'."
        )

    roc_auc = []
    for train_idx, valid_idx in skf.split(x, y):
        X_train, X_valid = x.iloc[train_idx], x.iloc[valid_idx]
        Y_train, Y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model.fit(X_train, Y_train)
        # Score with the predicted probability of the second class.
        Y_pred = model.predict_proba(X_valid)[:, 1]
        roc_auc.append(roc_auc_score(Y_valid, Y_pred))

    return np.mean(roc_auc)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Model families to tune; each name selects a branch inside objective().
models = [
    'TabPFN',
    'lightgbm',
    'random_forest',
    'logistic_regression',
    'neural_network',
]

# Best hyperparameters found by each study, keyed by model name.
best_params = {}

for model in models:
    # Every study gets its own TPE sampler with the same seed.
    sampler = TPESampler(seed=42)
    study = optuna.create_study(sampler=sampler, direction='maximize')
    study.optimize(
        lambda trial: objective(trial, model, x, y, skf),
        n_trials=150,
    )
    best_params[model] = study.best_params

[I 2024-06-08 12:37:32,775] A new study created in memory with name: no-name-469772a2-949d-4aaa-bf0d-60ab40f146d6


We have to download the TabPFN, as there is no checkpoint at  /usr/local/lib/python3.10/dist-packages/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
It has about 100MB, so this might take a moment.


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
  'alpha': trial.suggest_loguniform('alpha', 0.0001, 0.1),
[I 2024-06-08 14:32:15,359] Trial 149 finished with value: 0.842123440285205 and parameters: {'hidden_layer_sizes': (120, 70, 100), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00019422699716911926, 'learning_rate': 'adaptive', 'max_iter': 253}. Best is trial 52 with value: 0.8800579322638146.


In [None]:
# Report the tuned hyperparameters, one line per model family.
for model in models:
    print(
        f"Best hyperparameters for {model}: ",
        best_params[model],
    )

Best hyperparameters for TabPFN:  {'N_ensemble_configurations': 57}
Best hyperparameters for lightgbm:  {'lambda_l1': 3.4225810502969627, 'lambda_l2': 0.00015468332312824296, 'num_leaves': 12, 'feature_fraction': 0.4342310907933723, 'bagging_fraction': 0.9346679213739028, 'bagging_freq': 6, 'min_child_samples': 12}
Best hyperparameters for random_forest:  {'n_estimators': 428, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True}
Best hyperparameters for logistic_regression:  {'C': 9.863752062156355}
Best hyperparameters for neural_network:  {'hidden_layer_sizes': (50, 10, 10), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.012602250423851701, 'learning_rate': 'adaptive', 'max_iter': 339}


  and should_run_async(code)


In [None]:
import pickle
import os

# Persist each model's best hyperparameters as its own pickle file.
save_folder = '/data/result'
os.makedirs(save_folder, exist_ok=True)

for model in models:
    save_path = os.path.join(save_folder, f"{model}_best_params.pkl")
    with open(save_path, "wb") as out_file:
        pickle.dump(best_params[model], out_file)