In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import shap
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, fbeta_score
import optuna
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import warnings

In [5]:
pr = 0.6
rec = 0.6

In [6]:
(2 * pr * rec) / (pr + rec)

0.6

In [2]:
random_state = 33 
test_size = 0.20 
n_trials = 100 #количество итераций optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) #optuna "тихий" режим
warnings.filterwarnings("ignore") #optuna "тихий" режим

In [3]:
df = pd.read_csv('df.csv')
X = df.drop(['ij_1'], axis=1)
y = df['ij_1']

In [4]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Logistic Regression

In [6]:
def objective(trial):
# Выбор масштабировщика
    scaler_name = trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'RobustScaler'])
    if scaler_name == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_name == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:                                                                                                                                                              
        scaler = RobustScaler()
    
    # Выбор параметров логистической регрессии
    C = trial.suggest_float('logreg__C', 0.001, 1000)
    penalty = trial.suggest_categorical('logreg__penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('logreg__solver', ['liblinear'])
    param_smote = {"k_neighbors": trial.suggest_int("k_neighbors", 4, 10)}
    param_threshold = trial.suggest_int("param_threshold", 10, 90)
        
    # Создание пайплайна
    pipeline = Pipeline([
        ('scaler', scaler),
        ('logreg', LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=10000, random_state=random_state))
    ])
    
    # Оценка качества модели с помощью кросс-валидации
    
    
    skf = StratifiedKFold(random_state=random_state, n_splits=5, shuffle=True)
    oversample = SMOTE(random_state=random_state, **param_smote)
    param_threshold_ = param_threshold
    f1_scores = []
    for i, (train_index, valid_index) in enumerate(skf.split(X_train_valid, y_train_valid)):
        X_train_oversampled, y_train_oversampled = oversample.fit_resample(X_train_valid.iloc[train_index], y_train_valid.iloc[train_index])
        pipeline.fit(X_train_oversampled, y_train_oversampled)
        pred_prob = pipeline.predict_proba(X_train_valid.iloc[valid_index])[:, 1]
        f1_scores.append(f1_score(y_train_valid.iloc[valid_index], pred_prob > param_threshold_/100))
    return np.mean(f1_scores)

# Настройка и запуск оптимизации
sampler = optuna.samplers.TPESampler(seed=random_state)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# Вывод лучших параметров
print(f"Лучшие параметры_LogReg: {study.best_params}")
print(f"Лучший скор на valid_LogReg: {study.best_value}")
# Обучение и оценка модели на тестовой выборке с использованием лучших параметров
best_params_LogReg = study.best_params
best_score_LogReg = study.best_value

# Воссоздание наилучшего масштабировщика
if best_params_LogReg['scaler'] == 'StandardScaler':
    best_scaler = StandardScaler()
elif best_params_LogReg['scaler'] == 'MinMaxScaler':
    best_scaler = MinMaxScaler()
else:
    best_scaler = RobustScaler()

# Воссоздание наилучшей модели
best_model_LogReg = Pipeline([
    ('scaler', best_scaler),
    ('logreg', LogisticRegression(
        C=best_params_LogReg['logreg__C'],
        penalty=best_params_LogReg['logreg__penalty'],
        solver=best_params_LogReg['logreg__solver'],
        max_iter=10000
    ))
])

oversample = SMOTE(random_state=random_state, k_neighbors=best_params_LogReg['k_neighbors'])
X_train_oversampled_fin, y_train_oversampled_fin = oversample.fit_resample(X_train_valid, y_train_valid)
best_model_LogReg.fit(X_train_oversampled_fin, y_train_oversampled_fin)
f1_LogReg = f1_score(y_test, (best_model_LogReg.predict_proba(X_test)[:, 1] > best_params_LogReg['param_threshold']/100).astype(int))
tn_LogReg, fp_LogReg, fn_LogReg, tp_LogReg = confusion_matrix(y_test, (best_model_LogReg.predict_proba(X_test)[:, 1] > best_params_LogReg['param_threshold']/100).astype(int)).ravel()
recall_LorReg = recall_score(y_test, (best_model_LogReg.predict_proba(X_test)[:, 1] > best_params_LogReg['param_threshold']/100).astype(int))
precision_LorReg = recall_score(y_test, (best_model_LogReg.predict_proba(X_test)[:, 1] > best_params_LogReg['param_threshold']/100).astype(int))

  0%|          | 0/100 [00:00<?, ?it/s]

Лучшие параметры_LogReg: {'scaler': 'RobustScaler', 'logreg__C': 329.71976414449614, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'k_neighbors': 4, 'param_threshold': 13}
Лучший скор на valid_LogReg: 0.39761904761904765


# KNN


In [8]:
def objective(trial):
# Выбор масштабировщика
    scaler_name = trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'RobustScaler'])
    if scaler_name == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_name == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:
        scaler = RobustScaler()
    
    # Выбор параметров логистической регрессии
    n_neighbors = trial.suggest_int("n_neighbors", 1, 15)
    weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
    metric = trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
    param_smote = {"k_neighbors": trial.suggest_int("k_neighbors", 4, 10)}
    param_threshold = trial.suggest_int("param_threshold", 10, 90)
        
    # Создание пайплайна
    pipeline = Pipeline([
        ('scaler', scaler),
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric))
    ])
    
    # Оценка качества модели с помощью кросс-валидации
    
    
    skf = StratifiedKFold(random_state=random_state, n_splits=5, shuffle=True)
    oversample = SMOTE(random_state=random_state, **param_smote)
    param_threshold_ = param_threshold
    f1_scores = []
    for i, (train_index, valid_index) in enumerate(skf.split(X_train_valid, y_train_valid)):
        
        X_train_oversampled, y_train_oversampled = oversample.fit_resample(X_train_valid.iloc[train_index], y_train_valid.iloc[train_index])
        pipeline.fit(X_train_oversampled, y_train_oversampled)
        pred_prob = pipeline.predict_proba(X_train_valid.iloc[valid_index])[:, 1]
        f1_scores.append(f1_score(y_train_valid.iloc[valid_index], pred_prob > param_threshold_/100))
    return np.mean(f1_scores)

# Настройка и запуск оптимизации
sampler = optuna.samplers.TPESampler(seed=random_state)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# Вывод лучших параметров
print(f"Лучшие параметры_KNN: {study.best_params}")
print(f"Лучший скор на valid_KNN: {study.best_value}")
# Обучение и оценка модели на тестовой выборке с использованием лучших параметров
best_params_KNN = study.best_params
best_score_KNN = study.best_value

# Воссоздание наилучшего масштабировщика
if best_params_KNN['scaler'] == 'StandardScaler':
    best_scaler = StandardScaler()
elif best_params_KNN['scaler'] == 'MinMaxScaler':
    best_scaler = MinMaxScaler()
else:
    best_scaler = RobustScaler()

# Воссоздание наилучшей модели
best_model_KNN = Pipeline([
    ('scaler', best_scaler),
    ('knn', KNeighborsClassifier(
        n_neighbors=best_params_KNN['n_neighbors'],
        weights=best_params_KNN['weights'],
        metric=best_params_KNN['metric']
    ))
])

oversample = SMOTE(random_state=random_state, k_neighbors=best_params_KNN['k_neighbors'])
X_train_oversampled_fin, y_train_oversampled_fin = oversample.fit_resample(X_train_valid, y_train_valid)
best_model_KNN.fit(X_train_oversampled_fin, y_train_oversampled_fin)
f1_KNN = f1_score(y_test, (best_model_KNN.predict_proba(X_test)[:, 1] > best_params_KNN['param_threshold']/100).astype(int))
tn_KNN, fp_KNN, fn_KNN, tp_KNN = confusion_matrix(y_test, (best_model_KNN.predict_proba(X_test)[:, 1] > best_params_KNN['param_threshold']/100).astype(int)).ravel()
recall_KNN = recall_score(y_test, (best_model_KNN.predict_proba(X_test)[:, 1] > best_params_KNN['param_threshold']/100).astype(int))
precision_KNN = recall_score(y_test, (best_model_KNN.predict_proba(X_test)[:, 1] > best_params_KNN['param_threshold']/100).astype(int))

  0%|          | 0/100 [00:00<?, ?it/s]

Лучшие параметры_KNN: {'scaler': 'StandardScaler', 'n_neighbors': 15, 'weights': 'distance', 'metric': 'minkowski', 'k_neighbors': 10, 'param_threshold': 79}
Лучший скор на valid_KNN: 0.3354710305174701


# Catboost

In [9]:
def objective(trial):

    
    param = {
        "iterations": trial.suggest_int("iterations", 100, 1000), 
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5)}      
       
    
    
    param_smote = {"k_neighbors": trial.suggest_int("k_neighbors", 4, 10)}
    param_threshold = trial.suggest_int("param_threshold", 10, 90)
      
    
    # Создание пайплайна
    pipeline = CatBoostClassifier(
            **param,
            verbose=0,  #task_type="GPU", devices='0',
            random_state=random_state, early_stopping_rounds=150, eval_metric='F1')
    skf = StratifiedKFold(random_state=random_state, n_splits=5, shuffle=True)
    
    oversample = SMOTE(random_state=random_state, **param_smote)
    param_threshold_ = param_threshold
    f1_scores = []
    for i, (train_index, valid_index) in enumerate(skf.split(X_train_valid, y_train_valid)):
        
        X_train_oversampled, y_train_oversampled = oversample.fit_resample(X_train_valid.iloc[train_index], y_train_valid.iloc[train_index])
        pipeline.fit(X_train_oversampled, y_train_oversampled)
        pred_prob = pipeline.predict_proba(X_train_valid.iloc[valid_index])[:, 1]
        f1_scores.append(f1_score(y_train_valid.iloc[valid_index], pred_prob > param_threshold_/100))
    return np.mean(f1_scores)

# Настройка и запуск оптимизации
sampler = optuna.samplers.TPESampler(seed=random_state)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# Вывод лучших параметров
print(f"Лучшие параметры_catboost: {study.best_params}")
print(f"Лучший скор на valid_catboost: {study.best_value}")

# Обучение и оценка модели на тестовой выборке с использованием лучших параметров
best_params_catboost = study.best_params
best_score_catboost = study.best_value

best_model_catboost = CatBoostClassifier(
    verbose=0,  #task_type="GPU", devices='0',
    random_state=random_state,
    iterations=best_params_catboost['iterations'], 
    min_data_in_leaf=best_params_catboost['min_data_in_leaf'], 
    depth=best_params_catboost['depth'], 
    learning_rate=best_params_catboost['learning_rate']) 
oversample = SMOTE(random_state=random_state, k_neighbors=best_params_catboost['k_neighbors'])
X_train_oversampled_fin, y_train_oversampled_fin = oversample.fit_resample(X_train_valid, y_train_valid)
best_model_catboost.fit(X_train_oversampled_fin, y_train_oversampled_fin)
f1_catboost = f1_score(y_test, (best_model_catboost.predict_proba(X_test)[:, 1] > best_params_catboost['param_threshold']/100).astype(int))
tn_catboost, fp_catboost, fn_catboost, tp_catboost = confusion_matrix(y_test, (best_model_catboost.predict_proba(X_test)[:, 1] > best_params_catboost['param_threshold']/100).astype(int)).ravel()
recall_catboost = recall_score(y_test, (best_model_catboost.predict_proba(X_test)[:, 1] > best_params_catboost['param_threshold']/100).astype(int))
precision_catboost = recall_score(y_test, (best_model_catboost.predict_proba(X_test)[:, 1] > best_params_catboost['param_threshold']/100).astype(int))

  0%|          | 0/100 [00:00<?, ?it/s]

[33m[W 2024-08-28 19:18:07,241][0m Trial 5 failed with parameters: {'iterations': 974, 'min_data_in_leaf': 83, 'depth': 7, 'learning_rate': 0.16640688804229012, 'k_neighbors': 6, 'param_threshold': 16} because of the following error: KeyboardInterrupt('').[0m
Traceback (most recent call last):
  File "C:\Users\s.iglin\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\s.iglin\AppData\Local\Temp\ipykernel_7556\1990626920.py", line 29, in objective
    pipeline.fit(X_train_oversampled, y_train_oversampled)
  File "C:\Users\s.iglin\AppData\Local\Programs\Python\Python310\lib\site-packages\catboost\core.py", line 5201, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\s.iglin\AppData\Local\Programs\Python\Python310\lib\site-packages\catboost\core.py", line 2396, in _


KeyboardInterrupt



# MLP

In [32]:
def objective(trial):
# Выбор масштабировщика
    scaler_name = trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'RobustScaler'])
    if scaler_name == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_name == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:
        scaler = RobustScaler()
    
    # Выбор параметров многослойного перцептрона
    hidden_layer_sizes = trial.suggest_int("hidden_layer_sizes", 32, 512)
    activation = trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu'])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-2)
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-4, 1e-2)
    param_smote = {"k_neighbors": trial.suggest_int("k_neighbors", 4, 10)}
    param_threshold = trial.suggest_int("param_threshold", 10, 90)
        
    # Создание пайплайна
    pipeline = Pipeline([
        ('scaler', scaler),
        ('MLP', MLPClassifier(random_state=random_state, max_iter=3000, 
                              hidden_layer_sizes=hidden_layer_sizes, activation=activation,
            alpha=alpha,  learning_rate_init=learning_rate_init))
    ])
    
    # Оценка качества модели с помощью кросс-валидации
    
    
    skf = StratifiedKFold(random_state=random_state, n_splits=5, shuffle=True)
    
    oversample = SMOTE(random_state=random_state, **param_smote)
    f1_scores = []
    param_threshold_ = param_threshold
    for i, (train_index, valid_index) in enumerate(skf.split(X_train_valid, y_train_valid)):
        
        X_train_oversampled, y_train_oversampled = oversample.fit_resample(X_train_valid.iloc[train_index], y_train_valid.iloc[train_index])
        pipeline.fit(X_train_oversampled, y_train_oversampled)
        pred_prob = pipeline.predict_proba(X_train_valid.iloc[valid_index])[:, 1]
        f1_scores.append(f1_score(y_train_valid.iloc[valid_index], pred_prob > param_threshold_/100))
    return np.mean(f1_scores)

# Настройка и запуск оптимизации
sampler = optuna.samplers.TPESampler(seed=random_state)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# Вывод лучших параметров
print(f"Лучшие параметры_mlp: {study.best_params}")
print(f"Лучший скор на valid_mlp: {study.best_value}")

# Обучение и оценка модели на тестовой выборке с использованием лучших параметров
best_params_mlp = study.best_params
best_score_mlp = study.best_value

# Воссоздание наилучшего масштабировщика
if best_params_mlp['scaler'] == 'StandardScaler':
    best_scaler = StandardScaler()
elif best_params_mlp['scaler'] == 'MinMaxScaler':
    best_scaler = MinMaxScaler()
else:
    best_scaler = RobustScaler()

# Воссоздание наилучшей модели
best_model_mlp = Pipeline([
    ('scaler', best_scaler),
    ('MLP', MLPClassifier(
        
        hidden_layer_sizes=best_params_mlp['hidden_layer_sizes'],
        activation=best_params_mlp['activation'],
        alpha=best_params_mlp['alpha'],
        learning_rate_init=best_params_mlp['learning_rate_init'],
        random_state=random_state,
        max_iter=3000
    ))
])
oversample = SMOTE(random_state=random_state, k_neighbors=best_params_mlp['k_neighbors'])
X_train_oversampled_fin, y_train_oversampled_fin = oversample.fit_resample(X_train_valid, y_train_valid)
best_model_mlp.fit(X_train_oversampled_fin, y_train_oversampled_fin)
f1_mlp = f1_score(y_test, (best_model_mlp.predict_proba(X_test)[:, 1] > best_params_mlp['param_threshold']/100).astype(int))
tn_mlp, fp_mlp, fn_mlp, tp_mlp = confusion_matrix(y_test, (best_model_mlp.predict_proba(X_test)[:, 1] > best_params_mlp['param_threshold']/100).astype(int)).ravel()
recall_mlp = recall_score(y_test, (best_model_mlp.predict_proba(X_test)[:, 1] > best_params_mlp['param_threshold']/100).astype(int))
precision_mlp = recall_score(y_test, (best_model_mlp.predict_proba(X_test)[:, 1] > best_params_mlp['param_threshold']/100).astype(int))

  0%|          | 0/100 [00:00<?, ?it/s]

Лучшие параметры_mlp: {'scaler': 'RobustScaler', 'hidden_layer_sizes': 66, 'activation': 'relu', 'alpha': 0.009992552310447626, 'learning_rate_init': 0.009866731390762627, 'k_neighbors': 8, 'param_threshold': 25}
Лучший скор на valid_mlp: 0.38392156862745097


In [33]:
models = pd.DataFrame({'model': ['LogisticRegression', 'KNN', 'CatBoost', 'MLP'], 
              'F1_score_test': [f1_LogReg, f1_KNN, f1_catboost, f1_mlp],
              'F1_score_valid': [best_score_LogReg, best_score_KNN, best_score_catboost, best_score_mlp],        
              'Recall': [recall_LorReg, recall_KNN, recall_catboost, recall_mlp],
              'Precision': [precision_LorReg, precision_KNN, precision_catboost, precision_mlp],
              'TN': [tn_LogReg, tn_KNN, tn_catboost, tn_mlp],
              'FP': [fp_LogReg, fp_KNN, fp_catboost, fp_mlp],
              'FN': [fn_LogReg, fn_KNN, fn_catboost, fn_mlp],
              'TP': [tp_LogReg, tp_KNN, tp_catboost, tp_mlp],
              'prob_threshold': [best_params_LogReg['param_threshold'], best_params_KNN['param_threshold'], best_params_catboost['param_threshold'], best_params_mlp['param_threshold']]})

In [34]:
models.sort_values('F1_score_test')

Unnamed: 0,model,F1_score_test,F1_score_valid,Recall,Precision,TN,FP,FN,TP,prob_threshold
2,CatBoost,0.25,0.398889,0.2,0.2,226,2,4,1,36
3,MLP,0.333333,0.383922,0.2,0.2,228,0,4,1,25
1,KNN,0.352941,0.36735,0.6,0.6,219,9,2,3,81
0,LogisticRegression,0.727273,0.396681,0.8,0.8,226,2,1,4,70
