In [2]:
import os
# Отключает все логи TensorFlow (0 - все, 1 - предупреждения, 2 - ошибки, 3 - критичные ошибки)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
from neptune_tensorflow_keras import NeptuneCallback  
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import pandas as pd
import numpy as np
from typing import List, Union
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import neptune
import optuna
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import itertools
from sklearn.preprocessing import LabelEncoder
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray import train
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from dotenv import load_dotenv

# Подготовим данные

In [3]:
df = pd.read_csv('content/loan_data.csv')

In [4]:
df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,6,RENT,15000.0,MEDICAL,15.66,0.31,3.0,645,No,1
44996,37.0,female,Associate,65800.0,17,RENT,9000.0,HOMEIMPROVEMENT,14.07,0.14,11.0,621,No,1
44997,33.0,male,Associate,56942.0,7,RENT,2771.0,DEBTCONSOLIDATION,10.02,0.05,10.0,668,No,1
44998,29.0,male,Bachelor,33164.0,4,RENT,12000.0,EDUCATION,13.23,0.36,6.0,604,No,1


In [5]:
# Создаем LabelEncoder
label_encoders = {}  # Словарь для хранения энкодеров

# Применяем LabelEncoder ко всем нечисловым столбцам
for col in df.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Кодируем
    label_encoders[col] = le  # Сохраняем энкодер

In [6]:
X = df.drop(columns=['loan_status'])  
y = df['loan_status'] 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                  random_state=42)

## Вначале используем связку neptune + optuna

In [5]:
load_dotenv()

True

In [20]:
run = neptune.init_run(
        project=os.getenv("NEPTUNE_PROJECT"), 
        api_token=os.getenv("NEPTUNE_API_TOKEN"),
        capture_stdout=True,
        capture_stderr=True,
        capture_traceback=True,
        capture_hardware_metrics=True
    ) 

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/saatarko/financescoring/e/FIN-9


In [21]:
def objective(trial):
    # Оптимизируемые гиперпараметры
    lr = trial.suggest_categorical('learning_rate', [0.1, 0.01, 0.001])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    epochs = trial.suggest_categorical('epochs', [50, 100, 350, 500, 400, 750])
    layers = trial.suggest_categorical('layers', [35, 50, 75, 100, 250, 350, 500])
    dropout = trial.suggest_float('dropout', 0.1, 0.3, step=0.1)
    activation = trial.suggest_categorical('activation', ["relu", "sigmoid", "tanh"])
    optimizer_name = trial.suggest_categorical('optimizer', ["Adam", "SGD", "RMSprop"])

    optimizer = {
        "Adam": tf.keras.optimizers.Adam(lr),
        "SGD": tf.keras.optimizers.SGD(lr),
        "RMSprop": tf.keras.optimizers.RMSprop(lr),
    }[optimizer_name]

    # Создаём модель
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(layers, activation=activation),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(layers, activation=activation),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(layers, activation=activation),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer=optimizer,
        metrics=['accuracy']
    )

    # Логируем гиперпараметры
    run[f'trials/{trial.number}/parameters'] = {
        'learning_rate': lr,
        'batch_size': batch_size,
        'epochs': epochs,
        'layers': layers,
        'dropout': dropout,
        'activation': activation,
        'optimizer': optimizer_name
    }

    # Коллбэк для Neptune (отдельное пространство для каждого trial)
    neptune_cbk = NeptuneCallback(run=run, base_namespace=f"trials/{trial.number}/metrics")

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
        ModelCheckpoint(f"model_{optimizer_name}.keras", monitor='val_loss', save_best_only=True),
        neptune_cbk  # Neptune логирует метрики в конкретный trial
    ]

    # Обучаем модель
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=callbacks,
        verbose=0,
        initial_epoch=0  # Убирает возможный сбой в step
    )

    # Оцениваем модель
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

    # Логируем метрики в Neptune
    for epoch, (train_loss, val_loss, train_acc, val_acc) in enumerate(zip(
        history.history['loss'], history.history['val_loss'],
        history.history['accuracy'], history.history['val_accuracy']
    )):
        run[f"trials/{trial.number}/metrics/train_loss"].log(train_loss, step=epoch)
        run[f"trials/{trial.number}/metrics/val_loss"].log(val_loss, step=epoch)
        run[f"trials/{trial.number}/metrics/train_accuracy"].log(train_acc, step=epoch)
        run[f"trials/{trial.number}/metrics/val_accuracy"].log(val_acc, step=epoch)

    # Финальные метрики
    run[f"trials/{trial.number}/final/loss"] = loss
    run[f"trials/{trial.number}/final/accuracy"] = accuracy

    return accuracy  # Оптимизируем по accuracy

In [22]:
# Оптимизация с Optuna
study = optuna.create_study(direction="maximize", study_name="Neptune_Optimization")
study.optimize(objective, n_trials=20)

# Логируем лучшие параметры
best_params = study.best_params
best_accuracy = study.best_value

run.stop()

print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_accuracy}")

[I 2025-03-21 12:23:52,583] A new study created in memory with name: Neptune_Optimization


[I 2025-03-21 12:24:43,369] Trial 0 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.01, 'batch_size': 64, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:25:33,376] Trial 1 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.1, 'batch_size': 64, 'epochs': 50, 'layers': 500, 'dropout': 0.3, 'activation': 'relu', 'optimizer': 'RMSprop'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:25:45,256] Trial 2 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.1, 'batch_size': 128, 'epochs': 750, 'layers': 250, 'dropout': 0.3, 'activation': 'relu', 'optimizer': 'SGD'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:25:59,505] Trial 3 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.01, 'batch_size': 64, 'epochs': 500, 'layers': 75, 'dropout': 0.2, 'activation': 'relu', 'optimizer': 'SGD'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:26:25,990] Trial 4 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.01, 'batch_size': 128, 'epochs': 500, 'layers': 100, 'dropout': 0.2, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:26:51,038] Trial 5 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.1, 'batch_size': 128, 'epochs': 50, 'layers': 100, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'SGD'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:28:23,868] Trial 6 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.001, 'batch_size': 64, 'epochs': 500, 'layers': 500, 'dropout': 0.2, 'activation': 'relu', 'optimizer': 'Adam'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:28:43,443] Trial 7 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.1, 'batch_size': 128, 'epochs': 750, 'layers': 100, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'RMSprop'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:29:44,644] Trial 8 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.01, 'batch_size': 128, 'epochs': 50, 'layers': 500, 'dropout': 0.3, 'activation': 'tanh', 'optimizer': 'Adam'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:30:20,945] Trial 9 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.1, 'batch_size': 32, 'epochs': 50, 'layers': 100, 'dropout': 0.3, 'activation': 'tanh', 'optimizer': 'SGD'}. Best is trial 0 with value: 0.7766666412353516.


[I 2025-03-21 12:31:12,189] Trial 10 finished with value: 0.7850000262260437 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 10 with value: 0.7850000262260437.


[I 2025-03-21 12:32:48,876] Trial 11 finished with value: 0.824222207069397 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 11 with value: 0.824222207069397.


[I 2025-03-21 12:35:50,713] Trial 12 finished with value: 0.824999988079071 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:36:23,709] Trial 13 finished with value: 0.824222207069397 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 350, 'layers': 50, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:37:47,370] Trial 14 finished with value: 0.8192222118377686 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:38:04,692] Trial 15 finished with value: 0.7766666412353516 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 100, 'layers': 35, 'dropout': 0.2, 'activation': 'sigmoid', 'optimizer': 'Adam'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:40:42,455] Trial 16 finished with value: 0.8247777819633484 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'tanh', 'optimizer': 'RMSprop'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:41:57,498] Trial 17 finished with value: 0.8247777819633484 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.2, 'activation': 'tanh', 'optimizer': 'RMSprop'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:42:18,772] Trial 18 finished with value: 0.8240000009536743 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 100, 'layers': 75, 'dropout': 0.1, 'activation': 'tanh', 'optimizer': 'RMSprop'}. Best is trial 12 with value: 0.824999988079071.


[I 2025-03-21 12:44:18,619] Trial 19 finished with value: 0.8243333101272583 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 350, 'layers': 250, 'dropout': 0.2, 'activation': 'tanh', 'optimizer': 'RMSprop'}. Best is trial 12 with value: 0.824999988079071.


[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 161 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 161 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/saatarko/financescoring/e/FIN-9/metadata
Best parameters: {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 400, 'layers': 350, 'dropout': 0.1, 'activation': 'sigmoid', 'optimizer': 'Adam'}
Best accuracy: 0.824999988079071


## Используем Ray Tune

In [23]:
def train_model(config):
    optimizer_dict = {
        "Adam": tf.keras.optimizers.Adam(config["learning_rate"]),
        "SGD": tf.keras.optimizers.SGD(config["learning_rate"]),
        "RMSprop": tf.keras.optimizers.RMSprop(config["learning_rate"]),
    }
    optimizer = optimizer_dict[config["optimizer"]]
    
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(config["layers"], activation=config["activation"]),
        tf.keras.layers.Dropout(config["dropout"]),
        tf.keras.layers.Dense(config["layers"], activation=config["activation"]),
        tf.keras.layers.Dropout(config["dropout"]),
        tf.keras.layers.Dense(config["layers"], activation=config["activation"]),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    
    model.compile(
        loss="binary_crossentropy",
        optimizer=optimizer,
        metrics=['accuracy']
    )
    
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
        ModelCheckpoint(f"models/rtune/model_{config['optimizer']}.keras", monitor='val_loss', save_best_only=True)
    ]
    
    history = model.fit(
        X_train, y_train,
        epochs=config["epochs"],
        batch_size=config["batch_size"],
        validation_data=(X_test, y_test),
        callbacks=callbacks,
        verbose=0
    )
    
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    train.report({"loss": loss, "accuracy": accuracy})  

search_space = {
    "learning_rate": tune.choice([0.1, 0.01, 0.001]),
    "batch_size": tune.choice([32, 64, 128]),
    "epochs": tune.choice([50, 100, 350, 500, 400, 750]),
    "layers": tune.choice([35, 50, 75, 100, 250, 350, 500]),
    "dropout": tune.uniform(0.1, 0.3),
    "activation": tune.choice(["relu", "sigmoid", "tanh"]),
    "optimizer": tune.choice(["Adam", "SGD", "RMSprop"])
}

tuner = tune.Tuner(
    train_model,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        metric="accuracy",
        mode="max",
        num_samples=-1,
        scheduler=ASHAScheduler()
    )
)

In [24]:
tuner.fit()

0,1
Current time:,2025-03-21 13:12:41
Running for:,00:27:46.36
Memory:,13.9/15.5 GiB

Trial name,status,loc,activation,batch_size,dropout,epochs,layers,learning_rate,optimizer,iter,total time (s),loss,accuracy
train_model_24c4b_00011,RUNNING,10.8.37.30:69979,tanh,32,0.225304,500,350,0.1,Adam,,,,
train_model_24c4b_00024,RUNNING,10.8.37.30:84702,sigmoid,128,0.189647,100,350,0.001,SGD,,,,
train_model_24c4b_00038,RUNNING,10.8.37.30:104368,sigmoid,32,0.151268,500,350,0.1,RMSprop,,,,
train_model_24c4b_00039,RUNNING,10.8.37.30:104369,sigmoid,32,0.121553,100,350,0.01,RMSprop,,,,
train_model_24c4b_00050,RUNNING,10.8.37.30:116122,sigmoid,32,0.22115,100,250,0.1,Adam,,,,
train_model_24c4b_00055,RUNNING,10.8.37.30:123111,tanh,32,0.113438,750,350,0.1,SGD,,,,
train_model_24c4b_00056,RUNNING,10.8.37.30:124503,tanh,32,0.146097,750,250,0.001,RMSprop,,,,
train_model_24c4b_00061,RUNNING,10.8.37.30:131108,relu,128,0.207278,100,100,0.001,SGD,,,,
train_model_24c4b_00063,RUNNING,10.8.37.30:133465,relu,32,0.255344,400,500,0.001,RMSprop,,,,
train_model_24c4b_00066,RUNNING,10.8.37.30:135652,tanh,64,0.109241,350,500,0.001,SGD,,,,


2025-03-21 13:12:41,244	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/saatarko/ray_results/train_model_2025-03-21_12-44-52' in 0.1949s.
2025-03-21 13:12:51,986	INFO tune.py:1041 -- Total run time: 1677.23 seconds (1666.16 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/saatarko/ray_results/train_model_2025-03-21_12-44-52", trainable=...)
- train_model_24c4b_00075: FileNotFoundError('Could not fetch metrics for train_model_24c4b_00075: both result.json and progress.csv were not found at /home/saatarko/ray_results/train_model_2025-03-21_12-44-52/train_model_24c4b_00075_75_activation=tanh,batch_size=128,dropout=0.2980,epochs=750,layers=500,learning_rate=0.0010,optimizer=RMSp_2025-03-21_12-45-05')
- train_model_24c4b_00076: FileNotFoundError('Could not fetch metrics for train_model_24c4b_00076: both result.json and progress.csv were not found at /home/saatarko/ray_results/train_model_2025-03-21_12-44-52/trai

ResultGrid<[
  Result(
    metrics={'loss': 0.5390077829360962, 'accuracy': 0.7766666412353516},
    path='/home/saatarko/ray_results/train_model_2025-03-21_12-44-52/train_model_24c4b_00000_0_activation=tanh,batch_size=128,dropout=0.1632,epochs=50,layers=500,learning_rate=0.1000,optimizer=Adam_2025-03-21_12-45-03',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'loss': 0.5311062932014465, 'accuracy': 0.7766666412353516},
    path='/home/saatarko/ray_results/train_model_2025-03-21_12-44-52/train_model_24c4b_00001_1_activation=relu,batch_size=128,dropout=0.2769,epochs=50,layers=100,learning_rate=0.1000,optimizer=Adam_2025-03-21_12-45-03',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'loss': 0.5310869216918945, 'accuracy': 0.7766666412353516},
    path='/home/saatarko/ray_results/train_model_2025-03-21_12-44-52/train_model_24c4b_00002_2_activation=tanh,batch_size=32,dropout=0.2558,epochs=750,layers=350,learning_rate=0.1000,optimizer=SGD_

In [25]:
all_results = tuner.get_results().get_dataframe()
print(all_results.sort_values(by="accuracy", ascending=False).head(10))  # Топ-10 конфигураций


        loss  accuracy   timestamp checkpoint_dir_name   done  \
20  0.446844  0.827778  1742551125                None  False   
29  0.457137  0.824444  1742551255                None  False   
44  0.452675  0.824333  1742551710                None  False   
27  0.455731  0.822889  1742551331                None  False   
7   0.453462  0.820778  1742551015                None  False   
22  0.458493  0.820444  1742550968                None  False   
40  0.467217  0.816000  1742551407                None  False   
37  0.446781  0.815778  1742551708                None  False   
32  0.508512  0.805556  1742551176                None  False   
11  0.528844  0.778333  1742550779                None  False   

    training_iteration     trial_id                 date  time_this_iter_s  \
20                   1  24c4b_00021  2025-03-21_12-58-45        478.772214   
29                   1  24c4b_00031  2025-03-21_13-00-55        313.231522   
44                   1  24c4b_00048  2025-03-21_13

In [30]:
best_result = tuner.get_results().get_best_result(metric="accuracy", mode="max")
best_config = best_result.config  # Лучшая конфигурация гиперпараметров
best_accuracy = best_result.metrics["accuracy"]  # Итоговая точность

print("Лучшие гиперпараметры:", best_config)
print("Лучшая точность (accuracy):", best_accuracy)

Лучшие гиперпараметры: {'learning_rate': 0.001, 'batch_size': 64, 'epochs': 350, 'layers': 75, 'dropout': 0.17023624862631292, 'activation': 'tanh', 'optimizer': 'RMSprop'}
Лучшая точность (accuracy): 0.8277778029441833


## Попробуем GridSearchCV

In [28]:
param_grid = {
    'n_estimators': [10, 50, 100, 150, 200],
    'max_depth': [3, 5, 10, 15]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

{'max_depth': 15, 'n_estimators': 100}


In [32]:
print("Лучшие параметры:", grid_search.best_params_)
print("Лучшая точность (accuracy):", grid_search.best_score_)

Лучшие параметры: {'max_depth': 15, 'n_estimators': 100}
Лучшая точность (accuracy): 0.9238055555555555


In [33]:
test_accuracy = grid_search.best_estimator_.score(X_test, y_test)
print("Точность на тестовых данных:", test_accuracy)

Точность на тестовых данных: 0.9247777777777778
