In [8]:
import numpy as np
import pandas as pd
import optuna
import warnings
import tensorflow as tf
import dill
from keras_tuner import RandomSearch
import keras
from sklearn.exceptions import FitFailedWarning
from sklearn.model_selection import KFold, cross_val_score, cross_validate, RandomizedSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPClassifier, MLPRegressor
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FitFailedWarning)

In [3]:
clf_df = pd.read_csv("../data/asteroids_processed.csv")
reg_df = pd.read_csv("../data/cars_processed.csv")

In [4]:
X_clf = clf_df.drop(['hazardous'], axis=1)
y_clf = clf_df['hazardous']

In [5]:
oversample = SMOTE()
X_clf, y_clf = oversample.fit_resample(X_clf, y_clf)

In [6]:
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

In [9]:
scaler = StandardScaler()
X_clf_train = scaler.fit_transform(X_clf_train)
X_clf_test = scaler.transform(X_clf_test)

In [7]:
X_reg = reg_df.drop(['price_usd'], axis=1)
y_reg = reg_df['price_usd']

In [8]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
tree = DecisionTreeClassifier().fit(X_clf, y_clf)

In [10]:
rfe = RFE(estimator=tree, n_features_to_select=3, step=1).fit(X_clf, y_clf)
X_clf = pd.DataFrame(rfe.transform(X_clf), columns=rfe.get_feature_names_out())
X_clf.head()

Unnamed: 0,est_diameter_max,relative_velocity,miss_distance
0,35,56014,1024332
1,68,7864,32681860
2,124,55257,65386361
3,43,41531,12607957
4,311,67639,71305897


In [11]:
tree = DecisionTreeRegressor().fit(X_reg, y_reg)

In [12]:
rfe = RFE(estimator=tree, n_features_to_select=13, step=1).fit(X_reg, y_reg)
X_reg = pd.DataFrame(rfe.transform(X_reg), columns=rfe.get_feature_names_out())
X_reg.head()

Unnamed: 0,manufacturer_name,model_name,color,odometer_value,year_produced,engine_capacity,body_type,number_of_photos,up_counter,duration_listed,features_count,engine_type_diesel,drivetrain_front
0,45.0,763.0,8.0,190000.0,2010.0,2.5,10.0,9.0,13.0,16.0,7.0,0.0,0.0
1,45.0,664.0,0.0,280000.0,2001.0,2.5,10.0,14.0,7.0,7.0,4.0,0.0,0.0
2,45.0,664.0,8.0,350000.0,2004.0,2.5,8.0,7.0,29.0,73.0,3.0,0.0,0.0
3,45.0,763.0,4.0,179000.0,2010.0,2.5,10.0,17.0,33.0,87.0,9.0,0.0,0.0
4,45.0,519.0,8.0,571317.0,1999.0,2.5,10.0,8.0,11.0,43.0,4.0,0.0,0.0


In [13]:
clf_table = pd.DataFrame()

In [14]:
def kfold_metrics(results, table, name):
    print('Accuracy:', round(np.mean(results['test_accuracy']), 4))
    print('Precision:', round(np.mean(results['test_precision']), 4))
    print('Recall:', round(np.mean(results['test_recall']), 4))
    print('F1-score:', round(np.mean(results['test_f1']), 2))
    table.loc[name, 'kfold_accuracy'] = round(np.mean(results['test_accuracy']), 4)
    table.loc[name, 'kfold_precision'] = round(np.mean(results['test_precision']), 4)
    table.loc[name, 'kfold_recall'] = round(np.mean(results['test_recall']), 4)
    table.loc[name, 'kfold_F1'] = round(np.mean(results['test_f1']), 2)

In [13]:
neural_model = MLPClassifier(
    activation = "tanh",
    solver = "sgd",
    max_iter = 5000)

In [15]:
parameters = {
    'solver': ['lbfgs', 'adam', 'sgd'],
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'alpha': [1e-6, 1e-4, 0.01, 0.05, 0.1]
}

In [16]:
rs = RandomizedSearchCV(MLPClassifier(), parameters, cv=5, n_jobs=-1)
results = cross_validate(rs, X_clf, y_clf, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

In [17]:
rs.fit(X_clf_train, y_clf_train)
rs.best_params_

{'solver': 'adam', 'alpha': 0.01, 'activation': 'identity'}

In [18]:
kfold_metrics(results, clf_table, 'MLPClassifier')

Accuracy: 0.6114
Precision: 0.7159
Recall: 0.6824
F1-score: 0.57


In [10]:
X_clf_train = X_clf_train.copy()
y_clf_train = y_clf_train.copy()
X_clf_test = X_clf_test.copy()
y_clf_test = y_clf_test.copy()

In [None]:
def objective(trial):
    solver = trial.suggest_categorical('solver', ['lbfgs', 'adam', 'sgd'])
    activation = trial.suggest_categorical('activation', ['logistic', 'tanh', 'relu', 'identity'])
    alpha = trial.suggest_categorical('alpha', [1e-6, 1e-4, 0.01, 0.05, 0.1])

    model = MLPClassifier(solver=solver, activation=activation, alpha=alpha)
    model.fit(X_clf_train, y_clf_train)
    y_pred = model.predict(X_clf_test)
    return accuracy_score(y_clf_test, y_pred)

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_model = MLPClassifier(solver=best_params['solver'], activation=best_params['activation'], alpha=best_params['alpha'])
results_opt = cross_validate(best_model, X_clf, y_clf, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
best_params

{'solver': 'adam', 'activation': 'relu', 'alpha': 0.0001}

In [15]:
space = {
    'solver': hp.choice('solver', ['adam', 'sgd', 'lbfgs']),
    'activation': hp.choice('activation', ['logistic', 'tanh', 'relu']),
    'alpha': hp.choice('alpha', [1e-6, 1e-4, 0.01, 0.05, 0.1])
}

In [None]:
def objective(params):
    model = MLPClassifier(**params)
    model.fit(X_clf_train, y_clf_train)
    y_pred = model.predict(X_clf_test)
    score = accuracy_score(y_clf_test, y_pred)
    return {'loss': -score, 'status': STATUS_OK}

In [17]:
trials = Trials()
best_params = fmin(objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

100%|██████████| 50/50 [7:47:43<00:00, 561.27s/trial, best loss: -0.9165833166633327]   


In [19]:
best_params

{'activation': 2, 'alpha': 3, 'solver': 2}

In [26]:
X_clf.shape

(149966, 3)

In [None]:
model_classification_1 = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu", input_shape=(3,)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dropout(0.05),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dropout(0.05),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dropout(0.05),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model_classification_1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="binary_crossentropy")
model_classification_1.fit(X_clf_train, y_clf_train, epochs=50)

Epoch 1/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - loss: 35792.0547
Epoch 2/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - loss: 15.1941
Epoch 3/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - loss: 0.7012
Epoch 4/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - loss: 0.6936
Epoch 5/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - loss: 5.3267
Epoch 6/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - loss: 0.6998
Epoch 7/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - loss: 0.7237
Epoch 8/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - loss: 0.6933
Epoch 9/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - loss: 0.8535
Epoch 10/25
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1e3046af740>

In [14]:
def build_model(hp):
    activation = hp.Choice("activation", ['sigmoid', 'tanh', 'relu'])
    
    optimizer_name = hp.Choice("optimizer", ["adam", "rmsprop", "sgd"])
    l2_alpha = hp.Choice("l2", [1e-6, 1e-4, 0.01, 0.05, 0.1])
    lr = hp.Choice("lr", [1e-4, 1e-3, 1e-2])

    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Dense(
        units=hp.Choice("units_0", [8, 16, 32, 64]),
        activation=activation,
        kernel_regularizer=tf.keras.regularizers.l2(l2_alpha),
        input_shape=(3,)
    ))

    model.add(tf.keras.layers.Dense(
        units=hp.Choice("units_1", [8, 16, 32, 64]),
        activation=activation,
        kernel_regularizer=tf.keras.regularizers.l2(l2_alpha)
    ))

    model.add(tf.keras.layers.Dropout(hp.Float("dropout", 0.0, 0.3, step=0.05)))

    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    elif optimizer_name == "rmsprop":
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        optimizer = tf.keras.optimizers.SGD(learning_rate=lr)

    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [25]:
tuner = RandomSearch(
    build_model,
    objective="val_loss",
    max_trials=30,
    directory="my_tuner_runs",
    project_name="clf_search_v1"
)

stop_early = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)

tuner.search(X_clf_train, y_clf_train,
             validation_split=0.2,
             epochs=50,
             callbacks=[stop_early],
             verbose=1)

FailedPreconditionError: my_tuner_runs\clf_search_v1 is not a directory

In [23]:
import os

path = 'my_tuner_runs/clf_search_v1'
print(os.path.isfile(path))  # True — это файл
print(os.path.isdir(path))   # True — это папка

False
True


In [None]:
best_model = tuner.get_best_models(num_models=1)[0]
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hp

In [20]:
dill.dump_session('session.pkl')

In [4]:
dill.load_session('session.pkl')