## Ансамбли и полносвязные нейронные сети
В этом ноутбуке вам нужно обучить модели на датасете классификации из предыдущего ноутбука и сравнить результаты. Вам будет предоставлен baseline, на основе которого вы будете доделывать предсказывающие модели. Оценка лабы будет зависеть от ROC-AUC на тестовых данных по следующим критериям:
\
AUC - на тестовых данных
- $AUC \leq 0.76$ - 0 баллов
- $0.76 < AUC \leq 0.77$ - 2 балла
- $0.77 < AUC \leq 0.78$ - 4 балла
- $0.78 < AUC \leq 0.79$ - 6 баллов
- $0.79 < AUC \leq 0.80$ - 8 баллов
- $AUC > 0.80$ - 10 баллов


## Экспериментируйте
Для получения лучшего качества придется поэкспериментировать. Подсказка: попробуйте оптимизировать гиперпараметры модели

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Загрузка данных
data = pd.read_csv('german.csv', sep=';')
X = data.iloc[:, 1:].to_numpy()
y = data.iloc[:, 0].to_numpy()

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Сетка гиперпараметров для RandomForest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Сетка гиперпараметров для GradientBoosting
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Сетка гиперпараметров для MLPClassifier
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}


In [None]:
# RandomizedSearchCV для RandomForest
rf_random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                      param_distributions=rf_param_grid, n_iter=20, cv=3,
                                      scoring='roc_auc', random_state=42)
rf_random_search.fit(X_train, y_train)
best_rf = rf_random_search.best_estimator_

# RandomizedSearchCV для GradientBoosting
gb_random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                                      param_distributions=gb_param_grid, n_iter=20, cv=3,
                                      scoring='roc_auc', random_state=42)
gb_random_search.fit(X_train, y_train)
best_gb = gb_random_search.best_estimator_

# RandomizedSearchCV для MLPClassifier
mlp_random_search = RandomizedSearchCV(estimator=MLPClassifier(max_iter=500, random_state=42),
                                       param_distributions=mlp_param_grid, n_iter=20, cv=3,
                                       scoring='roc_auc', random_state=42)
mlp_random_search.fit(X_train, y_train)
best_mlp = mlp_random_search.best_estimator_


In [None]:
# Оценка Random Forest
rf_pred_opt = best_rf.predict(X_test)
rf_roc_auc_opt = roc_auc_score(y_test, rf_pred_opt)
rf_accuracy_opt = accuracy_score(y_test, rf_pred_opt)
rf_precision_opt = precision_score(y_test, rf_pred_opt)
rf_recall_opt = recall_score(y_test, rf_pred_opt)

print("Оптимизированные метрики Random Forest:")
print(f"ROC AUC: {rf_roc_auc_opt:.2f}")
print(f"Accuracy: {rf_accuracy_opt:.2f}")
print(f"Precision: {rf_precision_opt:.2f}")
print(f"Recall: {rf_recall_opt:.2f}")

# Оценка Gradient Boosting
gb_pred_opt = best_gb.predict(X_test)
gb_roc_auc_opt = roc_auc_score(y_test, gb_pred_opt)
gb_accuracy_opt = accuracy_score(y_test, gb_pred_opt)
gb_precision_opt = precision_score(y_test, gb_pred_opt)
gb_recall_opt = recall_score(y_test, gb_pred_opt)

print("\nОптимизированные метрики Gradient Boosting:")
print(f"ROC AUC: {gb_roc_auc_opt:.2f}")
print(f"Accuracy: {gb_accuracy_opt:.2f}")
print(f"Precision: {gb_precision_opt:.2f}")
print(f"Recall: {gb_recall_opt:.2f}")

# Оценка MLP
mlp_pred_opt = best_mlp.predict(X_test)
mlp_roc_auc_opt = roc_auc_score(y_test, mlp_pred_opt)
mlp_accuracy_opt = accuracy_score(y_test, mlp_pred_opt)
mlp_precision_opt = precision_score(y_test, mlp_pred_opt)
mlp_recall_opt = recall_score(y_test, mlp_pred_opt)

print("\nОптимизированные метрики MLP (Neural Network):")
print(f"ROC AUC: {mlp_roc_auc_opt:.2f}")
print(f"Accuracy: {mlp_accuracy_opt:.2f}")
print(f"Precision: {mlp_precision_opt:.2f}")
print(f"Recall: {mlp_recall_opt:.2f}")


Оптимизированные метрики Random Forest:
ROC AUC: 0.68
Accuracy: 0.76
Precision: 0.78
Recall: 0.90

Оптимизированные метрики Gradient Boosting:
ROC AUC: 0.70
Accuracy: 0.76
Precision: 0.81
Recall: 0.86

Оптимизированные метрики MLP (Neural Network):
ROC AUC: 0.52
Accuracy: 0.70
Precision: 0.70
Recall: 0.99


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Объединяем модели в ансамбль
estimators = [
    ('rf', best_rf),
    ('gb', best_gb),
    ('mlp', best_mlp)
]

# Модель для ансамбля
stacked_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5
)

# Обучение ансамбля
stacked_model.fit(X_train, y_train)

# Оценка ансамбля
stacked_pred = stacked_model.predict(X_test)
stacked_roc_auc = roc_auc_score(y_test, stacked_pred)
stacked_accuracy = accuracy_score(y_test, stacked_pred)
stacked_precision = precision_score(y_test, stacked_pred)
stacked_recall = recall_score(y_test, stacked_pred)

print("Ансамбль (Stacking) метрики:")
print(f"ROC AUC: {stacked_roc_auc:.2f}")
print(f"Accuracy: {stacked_accuracy:.2f}")
print(f"Precision: {stacked_precision:.2f}")
print(f"Recall: {stacked_recall:.2f}")


Ансамбль (Stacking) метрики:
ROC AUC: 0.69
Accuracy: 0.77
Precision: 0.80
Recall: 0.88


In [None]:
# Глубокая MLP модель
deep_mlp_model = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32, 16),  # Больше слоев и нейронов
    activation='relu',
    solver='adam',
    alpha=0.0001,  # Регуляризация
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42
)

# Обучение глубокой MLP модели
deep_mlp_model.fit(X_train, y_train)

# Оценка глубокой MLP модели
deep_mlp_pred = deep_mlp_model.predict(X_test)
deep_mlp_roc_auc = roc_auc_score(y_test, deep_mlp_pred)
deep_mlp_accuracy = accuracy_score(y_test, deep_mlp_pred)
deep_mlp_precision = precision_score(y_test, deep_mlp_pred)
deep_mlp_recall = recall_score(y_test, deep_mlp_pred)

print("\nГлубокая MLP метрики:")
print(f"ROC AUC: {deep_mlp_roc_auc:.2f}")
print(f"Accuracy: {deep_mlp_accuracy:.2f}")
print(f"Precision: {deep_mlp_precision:.2f}")
print(f"Recall: {deep_mlp_recall:.2f}")



Глубокая MLP метрики:
ROC AUC: 0.50
Accuracy: 0.69
Precision: 0.69
Recall: 1.00


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from catboost import CatBoostClassifier

# Настройка и обучение CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,  # Больше итераций для повышения качества
    depth=6,         # Глубина деревьев
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=100
)

catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Оценка CatBoost
catboost_pred = catboost_model.predict(X_test)
catboost_roc_auc = roc_auc_score(y_test, catboost_pred)
catboost_accuracy = accuracy_score(y_test, catboost_pred)
catboost_precision = precision_score(y_test, catboost_pred)
catboost_recall = recall_score(y_test, catboost_pred)

print("\nCatBoost метрики:")
print(f"ROC AUC: {catboost_roc_auc:.2f}")
print(f"Accuracy: {catboost_accuracy:.2f}")
print(f"Precision: {catboost_precision:.2f}")
print(f"Recall: {catboost_recall:.2f}")


0:	test: 0.7086840	best: 0.7086840 (0)	total: 60.5ms	remaining: 30.2s
100:	test: 0.7688172	best: 0.7844787 (11)	total: 419ms	remaining: 1.65s
200:	test: 0.7607527	best: 0.7844787 (11)	total: 739ms	remaining: 1.1s
300:	test: 0.7536232	best: 0.7844787 (11)	total: 1.03s	remaining: 679ms
400:	test: 0.7482468	best: 0.7844787 (11)	total: 1.44s	remaining: 354ms
499:	test: 0.7459093	best: 0.7844787 (11)	total: 1.69s	remaining: 0us

bestTest = 0.7844787284
bestIteration = 11

Shrink model to first 12 iterations.

CatBoost метрики:
ROC AUC: 0.64
Accuracy: 0.75
Precision: 0.76
Recall: 0.93


In [None]:
from sklearn.ensemble import VotingClassifier

# Voting Classifier с комбинацией нескольких моделей
voting_model = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('gb', best_gb),
        ('catboost', catboost_model)
    ],
    voting='soft'  # Используем вероятности, а не классы
)

# Обучение Voting Classifier
voting_model.fit(X_train, y_train)

# Оценка Voting Classifier
voting_pred = voting_model.predict(X_test)
voting_roc_auc = roc_auc_score(y_test, voting_pred)
voting_accuracy = accuracy_score(y_test, voting_pred)
voting_precision = precision_score(y_test, voting_pred)
voting_recall = recall_score(y_test, voting_pred)

print("\nVoting Classifier метрики:")
print(f"ROC AUC: {voting_roc_auc:.2f}")
print(f"Accuracy: {voting_accuracy:.2f}")
print(f"Precision: {voting_precision:.2f}")
print(f"Recall: {voting_recall:.2f}")


0:	total: 1.77ms	remaining: 884ms
100:	total: 657ms	remaining: 2.6s
200:	total: 1.61s	remaining: 2.39s
300:	total: 2.23s	remaining: 1.47s
400:	total: 2.99s	remaining: 737ms
499:	total: 3.87s	remaining: 0us

Voting Classifier метрики:
ROC AUC: 0.70
Accuracy: 0.76
Precision: 0.81
Recall: 0.85


In [None]:
from catboost import CatBoostClassifier

# Настройка гиперпараметров CatBoost с более высокой точностью
catboost_optimized = CatBoostClassifier(
    iterations=1000,           # Увеличим количество итераций для лучшего результата
    depth=8,                   # Глубина деревьев
    learning_rate=0.05,        # Меньшая скорость обучения для более точной подгонки
    l2_leaf_reg=10,            # Регуляризация для предотвращения переобучения
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=100
)

# Обучение оптимизированного CatBoost
catboost_optimized.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Оценка
catboost_optimized_pred = catboost_optimized.predict(X_test)
catboost_optimized_roc_auc = roc_auc_score(y_test, catboost_optimized_pred)
catboost_optimized_accuracy = accuracy_score(y_test, catboost_optimized_pred)
catboost_optimized_precision = precision_score(y_test, catboost_optimized_pred)
catboost_optimized_recall = recall_score(y_test, catboost_optimized_pred)

print("\nОптимизированные метрики CatBoost:")
print(f"ROC AUC: {catboost_optimized_roc_auc:.2f}")
print(f"Accuracy: {catboost_optimized_accuracy:.2f}")
print(f"Precision: {catboost_optimized_precision:.2f}")
print(f"Recall: {catboost_optimized_recall:.2f}")


0:	test: 0.7162225	best: 0.7162225 (0)	total: 10.2ms	remaining: 10.2s
100:	test: 0.7899719	best: 0.8014259 (13)	total: 454ms	remaining: 4.04s
200:	test: 0.7833100	best: 0.8014259 (13)	total: 1.08s	remaining: 4.28s
300:	test: 0.7725573	best: 0.8014259 (13)	total: 1.46s	remaining: 3.4s
400:	test: 0.7677653	best: 0.8014259 (13)	total: 1.85s	remaining: 2.76s
500:	test: 0.7658953	best: 0.8014259 (13)	total: 2.27s	remaining: 2.26s
600:	test: 0.7640252	best: 0.8014259 (13)	total: 2.65s	remaining: 1.76s
700:	test: 0.7633240	best: 0.8014259 (13)	total: 3.28s	remaining: 1.4s
800:	test: 0.7628565	best: 0.8014259 (13)	total: 3.93s	remaining: 976ms
900:	test: 0.7607527	best: 0.8014259 (13)	total: 4.8s	remaining: 528ms
999:	test: 0.7612202	best: 0.8014259 (13)	total: 5.08s	remaining: 0us

bestTest = 0.8014259
bestIteration = 13

Shrink model to first 14 iterations.

Оптимизированные метрики CatBoost:
ROC AUC: 0.62
Accuracy: 0.74
Precision: 0.75
Recall: 0.93


In [None]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Настройка моделей XGBoost и LightGBM
xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
lgbm_model = LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)

# Объединяем модели в Voting Classifier
voting_boosting_model = VotingClassifier(
    estimators=[
        ('catboost', catboost_optimized),
        ('xgb', xgb_model),
        ('lgbm', lgbm_model)
    ],
    voting='soft'
)

# Обучение Voting Classifier
voting_boosting_model.fit(X_train, y_train)

# Оценка Voting Classifier
voting_boosting_pred = voting_boosting_model.predict(X_test)
voting_boosting_roc_auc = roc_auc_score(y_test, voting_boosting_pred)
voting_boosting_accuracy = accuracy_score(y_test, voting_boosting_pred)
voting_boosting_precision = precision_score(y_test, voting_boosting_pred)
voting_boosting_recall = recall_score(y_test, voting_boosting_pred)

print("\nVoting Classifier (CatBoost, XGBoost, LightGBM) метрики:")
print(f"ROC AUC: {voting_boosting_roc_auc:.2f}")
print(f"Accuracy: {voting_boosting_accuracy:.2f}")
print(f"Precision: {voting_boosting_precision:.2f}")
print(f"Recall: {voting_boosting_recall:.2f}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



0:	total: 3.71ms	remaining: 3.71s
100:	total: 1.71s	remaining: 15.3s
200:	total: 2.55s	remaining: 10.1s
300:	total: 3.36s	remaining: 7.81s
400:	total: 4.11s	remaining: 6.14s
500:	total: 4.62s	remaining: 4.6s
600:	total: 5.19s	remaining: 3.45s
700:	total: 5.44s	remaining: 2.32s
800:	total: 5.71s	remaining: 1.42s
900:	total: 5.96s	remaining: 655ms
999:	total: 6.2s	remaining: 0us
[LightGBM] [Info] Number of positive: 562, number of negative: 238
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 412
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.702500 -> initscore=0.859231
[LightGBM] [Info] Start training from score 0.859231

Voting Classifier (CatBoost, XGBoost, LightGBM) метрики:
ROC AUC: 0.71

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

# Стандартизация данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели
deep_nn_model = Sequential()
deep_nn_model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
deep_nn_model.add(Dropout(0.3))
deep_nn_model.add(Dense(128, activation='relu'))
deep_nn_model.add(Dropout(0.3))
deep_nn_model.add(Dense(64, activation='relu'))
deep_nn_model.add(Dropout(0.3))
deep_nn_model.add(Dense(1, activation='sigmoid'))

# Компиляция модели
deep_nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])

# Обучение модели
history = deep_nn_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Оценка на тестовой выборке
nn_pred = (deep_nn_model.predict(X_test_scaled) > 0.5).astype("int32")
nn_roc_auc = roc_auc_score(y_test, nn_pred)
nn_accuracy = accuracy_score(y_test, nn_pred)
nn_precision = precision_score(y_test, nn_pred)
nn_recall = recall_score(y_test, nn_pred)

print("\nГлубокая нейронная сеть (Keras) метрики:")
print(f"ROC AUC: {nn_roc_auc:.2f}")
print(f"Accuracy: {nn_accuracy:.2f}")
print(f"Precision: {nn_precision:.2f}")
print(f"Recall: {nn_recall:.2f}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - AUC: 0.5323 - loss: 0.6693 - val_AUC: 0.7012 - val_loss: 0.5496
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.6975 - loss: 0.5361 - val_AUC: 0.7457 - val_loss: 0.5166
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.7953 - loss: 0.5175 - val_AUC: 0.7387 - val_loss: 0.5016
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.7889 - loss: 0.5002 - val_AUC: 0.7363 - val_loss: 0.5069
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.8335 - loss: 0.4666 - val_AUC: 0.7362 - val_loss: 0.5108
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.8273 - loss: 0.4460 - val_AUC: 0.7312 - val_loss: 0.5186
Epoch 7/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.8393 - loss: 0.44

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Оптимизация параметров моделей
catboost_optimized = CatBoostClassifier(
    iterations=1200, depth=8, learning_rate=0.03, l2_leaf_reg=15, loss_function='Logloss',
    eval_metric='AUC', random_seed=42, verbose=100)

xgb_optimized = XGBClassifier(n_estimators=300, learning_rate=0.03, max_depth=7, reg_lambda=1.5, random_state=42)
lgbm_optimized = LGBMClassifier(n_estimators=300, learning_rate=0.03, max_depth=7, reg_lambda=1.5, random_state=42)

# Добавляем RandomForest
rf_optimized = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=42)

# Voting Classifier с несколькими бустинговыми и случайными лесами
voting_boosting_optimized = VotingClassifier(
    estimators=[
        ('catboost', catboost_optimized),
        ('xgb', xgb_optimized),
        ('lgbm', lgbm_optimized),
        ('rf', rf_optimized)
    ],
    voting='soft'
)

# Обучение и оценка
voting_boosting_optimized.fit(X_train, y_train)
voting_optimized_pred = voting_boosting_optimized.predict(X_test)

# Метрики
roc_auc = roc_auc_score(y_test, voting_optimized_pred)
accuracy = accuracy_score(y_test, voting_optimized_pred)
precision = precision_score(y_test, voting_optimized_pred)
recall = recall_score(y_test, voting_optimized_pred)

print("\nVoting Classifier (CatBoost, XGBoost, LightGBM, RandomForest) метрики:")
print(f"ROC AUC: {roc_auc:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


0:	total: 5.64ms	remaining: 6.76s
100:	total: 698ms	remaining: 7.59s
200:	total: 1.09s	remaining: 5.44s
300:	total: 1.72s	remaining: 5.13s
400:	total: 2.38s	remaining: 4.75s
500:	total: 3.02s	remaining: 4.22s
600:	total: 3.9s	remaining: 3.88s
700:	total: 4.15s	remaining: 2.95s
800:	total: 4.42s	remaining: 2.2s
900:	total: 4.69s	remaining: 1.55s
1000:	total: 4.95s	remaining: 983ms
1100:	total: 5.2s	remaining: 467ms
1199:	total: 5.46s	remaining: 0us
[LightGBM] [Info] Number of positive: 562, number of negative: 238
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 412
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.702500 -> initscore=0.859231
[LightGBM] [Info] Start training from score 0.859231

In [None]:
from tensorflow.keras.layers import BatchNormalization

# Нейронная сеть с улучшенной архитектурой
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Предсказания и оценка
nn_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")
roc_auc_nn = roc_auc_score(y_test, nn_pred)
accuracy_nn = accuracy_score(y_test, nn_pred)
precision_nn = precision_score(y_test, nn_pred)
recall_nn = recall_score(y_test, nn_pred)

print("\nГлубокая нейронная сеть с улучшенной архитектурой (Keras) метрики:")
print(f"ROC AUC: {roc_auc_nn:.2f}")
print(f"Accuracy: {accuracy_nn:.2f}")
print(f"Precision: {precision_nn:.2f}")
print(f"Recall: {recall_nn:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 85ms/step - AUC: 0.6414 - loss: 0.8138 - val_AUC: 0.6750 - val_loss: 0.6405
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - AUC: 0.7639 - loss: 0.6306 - val_AUC: 0.6858 - val_loss: 0.5929
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - AUC: 0.7641 - loss: 0.6194 - val_AUC: 0.7124 - val_loss: 0.5558
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - AUC: 0.7688 - loss: 0.5504 - val_AUC: 0.7096 - val_loss: 0.5465
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - AUC: 0.8384 - loss: 0.4612 - val_AUC: 0.7047 - val_loss: 0.5322
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - AUC: 0.8315 - loss: 0.4863 - val_AUC: 0.6949 - val_loss: 0.5323
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - 

In [None]:
from imblearn.over_sampling import SMOTE

# Применение SMOTE для балансировки классов
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Обучение Voting Classifier на сбалансированном наборе
voting_boosting_optimized.fit(X_res, y_res)
voting_smote_pred = voting_boosting_optimized.predict(X_test)

# Метрики
roc_auc_smote = roc_auc_score(y_test, voting_smote_pred)
accuracy_smote = accuracy_score(y_test, voting_smote_pred)
precision_smote = precision_score(y_test, voting_smote_pred)
recall_smote = recall_score(y_test, voting_smote_pred)

print("\nVoting Classifier с балансировкой классов (SMOTE) метрики:")
print(f"ROC AUC: {roc_auc_smote:.2f}")
print(f"Accuracy: {accuracy_smote:.2f}")
print(f"Precision: {precision_smote:.2f}")
print(f"Recall: {recall_smote:.2f}")


0:	total: 5.32ms	remaining: 6.38s
100:	total: 1.07s	remaining: 11.6s
200:	total: 1.71s	remaining: 8.49s
300:	total: 2.28s	remaining: 6.82s
400:	total: 2.67s	remaining: 5.32s
500:	total: 3.06s	remaining: 4.27s
600:	total: 3.65s	remaining: 3.64s
700:	total: 4.13s	remaining: 2.94s
800:	total: 5.18s	remaining: 2.58s
900:	total: 5.5s	remaining: 1.82s
1000:	total: 5.99s	remaining: 1.19s
1100:	total: 6.5s	remaining: 585ms
1199:	total: 7.01s	remaining: 0us
[LightGBM] [Info] Number of positive: 562, number of negative: 562
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 438
[LightGBM] [Info] Number of data points in the train set: 1124, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Voting Classifier с балансировкой классов (SMOTE)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Настроим Stacking Classifier с несколькими лучшими моделями
stacking_model = StackingClassifier(
    estimators=[
        ('catboost', catboost_optimized),
        ('xgb', xgb_optimized),
        ('lgbm', lgbm_optimized),
        ('rf', rf_optimized)
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5
)

# Обучение и оценка Stacking Classifier
stacking_model.fit(X_train, y_train)
stacking_pred = stacking_model.predict(X_test)

roc_auc_stack = roc_auc_score(y_test, stacking_pred)
accuracy_stack = accuracy_score(y_test, stacking_pred)
precision_stack = precision_score(y_test, stacking_pred)
recall_stack = recall_score(y_test, stacking_pred)

print("\nStacking Classifier метрики:")
print(f"ROC AUC: {roc_auc_stack:.2f}")
print(f"Accuracy: {accuracy_stack:.2f}")
print(f"Precision: {precision_stack:.2f}")
print(f"Recall: {recall_stack:.2f}")


0:	total: 11.1ms	remaining: 13.3s
100:	total: 577ms	remaining: 6.27s
200:	total: 1.08s	remaining: 5.36s
300:	total: 1.75s	remaining: 5.21s
400:	total: 2.49s	remaining: 4.96s
500:	total: 3.4s	remaining: 4.74s
600:	total: 3.77s	remaining: 3.76s
700:	total: 4.18s	remaining: 2.98s
800:	total: 4.77s	remaining: 2.38s
900:	total: 5.31s	remaining: 1.76s
1000:	total: 5.8s	remaining: 1.15s
1100:	total: 6.37s	remaining: 573ms
1199:	total: 6.9s	remaining: 0us
[LightGBM] [Info] Number of positive: 562, number of negative: 238
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 412
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.702500 -> initscore=0.859231
[LightGBM] [Info] Start training from score 0.859231

In [None]:
from tensorflow.keras.layers import Conv1D, Flatten, Reshape

# Настроим более глубокую и сложную сеть
model = Sequential([
    Reshape((X_train.shape[1], 1), input_shape=(X_train.shape[1],)),
    Conv1D(64, 2, activation='relu'),
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Предсказания и оценка
nn_pred_deep = (model.predict(X_test_scaled) > 0.5).astype("int32")
roc_auc_nn_deep = roc_auc_score(y_test, nn_pred_deep)
accuracy_nn_deep = accuracy_score(y_test, nn_pred_deep)
precision_nn_deep = precision_score(y_test, nn_pred_deep)
recall_nn_deep = recall_score(y_test, nn_pred_deep)

print("\nГлубокая нейронная сеть с Conv1D и Dropout метрики:")
print(f"ROC AUC: {roc_auc_nn_deep:.2f}")
print(f"Accuracy: {accuracy_nn_deep:.2f}")
print(f"Precision: {precision_nn_deep:.2f}")
print(f"Recall: {recall_nn_deep:.2f}")


  super().__init__(**kwargs)


Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 69ms/step - AUC: 0.5590 - loss: 0.9443 - val_AUC: 0.7677 - val_loss: 0.5920
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - AUC: 0.7588 - loss: 0.6443 - val_AUC: 0.7263 - val_loss: 0.5490
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - AUC: 0.7505 - loss: 0.6271 - val_AUC: 0.6984 - val_loss: 0.5451
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - AUC: 0.8311 - loss: 0.4894 - val_AUC: 0.7178 - val_loss: 0.5311
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - AUC: 0.8306 - loss: 0.4829 - val_AUC: 0.7079 - val_loss: 0.5448
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - AUC: 0.8472 - loss: 0.4456 - val_AUC: 0.7145 - val_loss: 0.5411
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step -



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step

Глубокая нейронная сеть с Conv1D и Dropout метрики:
ROC AUC: 0.66
Accuracy: 0.69
Precision: 0.79
Recall: 0.75


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score

# Первый уровень - простые классификаторы
level1_estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb_optimized),
    ('catboost', catboost_optimized),
    ('lgbm', lgbm_optimized)
]

# Stacking Classifier для второго уровня
stacked_model_lvl1 = StackingClassifier(estimators=level1_estimators, final_estimator=LogisticRegression(), cv=5)

# Обучение второго уровня модели
stacked_model_lvl1.fit(X_train, y_train)
stacked_lvl1_pred = stacked_model_lvl1.predict(X_test)

# Оценка мета-ансамбля
roc_auc_lvl1 = roc_auc_score(y_test, stacked_lvl1_pred)
accuracy_lvl1 = accuracy_score(y_test, stacked_lvl1_pred)
precision_lvl1 = precision_score(y_test, stacked_lvl1_pred)
recall_lvl1 = recall_score(y_test, stacked_lvl1_pred)

print("\nMeta-Ensemble (Stacking на двух уровнях) метрики:")
print(f"ROC AUC: {roc_auc_lvl1:.2f}")
print(f"Accuracy: {accuracy_lvl1:.2f}")
print(f"Precision: {precision_lvl1:.2f}")
print(f"Recall: {recall_lvl1:.2f}")


0:	total: 15.8ms	remaining: 18.9s
100:	total: 913ms	remaining: 9.93s
200:	total: 1.66s	remaining: 8.26s
300:	total: 2.58s	remaining: 7.71s
400:	total: 3.74s	remaining: 7.45s
500:	total: 4.61s	remaining: 6.43s
600:	total: 6.47s	remaining: 6.45s
700:	total: 6.93s	remaining: 4.93s
800:	total: 7.34s	remaining: 3.66s
900:	total: 7.6s	remaining: 2.52s
1000:	total: 7.86s	remaining: 1.56s
1100:	total: 8.12s	remaining: 730ms
1199:	total: 8.4s	remaining: 0us
[LightGBM] [Info] Number of positive: 562, number of negative: 238
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 412
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.702500 -> initscore=0.859231
[LightGBM] [Info] Start training from score 0.85923

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [None]:
# Импортируем Optuna
import optuna

def objective(trial):
    # Параметры для RandomForest
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 10)

    # Модель RandomForest с гиперпараметрами
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    # Кросс-валидация
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc").mean()
    return score

# Оптимизация
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best Hyperparameters:", study.best_params)


[I 2024-10-26 18:42:54,091] A new study created in memory with name: no-name-c6f2761c-9975-4bd8-8b21-194e2e929b06
[I 2024-10-26 18:43:12,808] Trial 0 finished with value: 0.7918125271785422 and parameters: {'n_estimators': 723, 'max_depth': 8}. Best is trial 0 with value: 0.7918125271785422.
[I 2024-10-26 18:43:24,135] Trial 1 finished with value: 0.7905198722776626 and parameters: {'n_estimators': 454, 'max_depth': 7}. Best is trial 0 with value: 0.7918125271785422.
[I 2024-10-26 18:43:30,448] Trial 2 finished with value: 0.7741800205772386 and parameters: {'n_estimators': 258, 'max_depth': 3}. Best is trial 0 with value: 0.7918125271785422.
[I 2024-10-26 18:43:40,591] Trial 3 finished with value: 0.7803516511104537 and parameters: {'n_estimators': 895, 'max_depth': 4}. Best is trial 0 with value: 0.7918125271785422.
[I 2024-10-26 18:43:41,839] Trial 4 finished with value: 0.7804278771148827 and parameters: {'n_estimators': 133, 'max_depth': 4}. Best is trial 0 with value: 0.791812527

Best Hyperparameters: {'n_estimators': 297, 'max_depth': 8}


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Инициализация модели с оптимальными параметрами
optimized_rf_model = RandomForestClassifier(n_estimators=297, max_depth=8, random_state=42)

# Обучение модели на тренировочных данных
optimized_rf_model.fit(X_train, y_train)

# Предсказание на тестовых данных
optimized_rf_pred = optimized_rf_model.predict(X_test)

# Расчет метрик
roc_auc_opt_rf = roc_auc_score(y_test, optimized_rf_pred)
accuracy_opt_rf = accuracy_score(y_test, optimized_rf_pred)
precision_opt_rf = precision_score(y_test, optimized_rf_pred)
recall_opt_rf = recall_score(y_test, optimized_rf_pred)

print("Оптимизированная Random Forest модель метрики:")
print(f"ROC AUC: {roc_auc_opt_rf:.3f}")
print(f"Accuracy: {accuracy_opt_rf:.3f}")
print(f"Precision: {precision_opt_rf:.3f}")
print(f"Recall: {recall_opt_rf:.3f}")


Оптимизированная Random Forest модель метрики:
ROC AUC: 0.638
Accuracy: 0.745
Precision: 0.760
Recall: 0.920


In [None]:
# Обучение Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=20,
    random_state=48,
    min_samples_split=21
)

rf_model.fit(X_train, y_train)

# Прогноз на тестовых данных
rf_pred = rf_model.predict_proba(X_test)[:,1]

# Расчет метрик для Random Forest
rf_roc_auc = roc_auc_score(y_test, rf_pred)

print("Random Forest метрики:")
print(f"ROC AUC: {rf_roc_auc:.2f}")
print(f"ROC AUC: {rf_roc_auc:.5f}")
# Обучение Gradient Boosting
gb_model = GradientBoostingClassifier(
    criterion = 'squared_error',
    learning_rate = 0.01,
    n_estimators = 250,
    random_state = 42)
gb_model.fit(X_train, y_train)

# Прогноз на тестовых данных
gb_pred = gb_model.predict_proba(X_test)[:,1]

# Расчет метрик для Gradient Boosting
gb_roc_auc = roc_auc_score(y_test, gb_pred)

print("\nGradient Boosting метрики:")
print(f"ROC AUC: {gb_roc_auc:.2f}")
print(f"ROC AUC: {gb_roc_auc:.5f}")

Random Forest метрики:
ROC AUC: 0.79
ROC AUC: 0.79079

Gradient Boosting метрики:
ROC AUC: 0.77
ROC AUC: 0.77361


In [None]:
# Обучение MLP (Multi-Layer Perceptron) нейронной сети
mlp_model = MLPClassifier(hidden_layer_sizes=(30,), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

# Прогноз на тестовых данных
mlp_pred = mlp_model.predict_proba(X_test)[:,1]

# Расчет метрик для MLP нейронной сети
mlp_roc_auc = roc_auc_score(y_test, mlp_pred)

print("\nMLP (Neural Network) метрики:")
print(f"ROC AUC: {mlp_roc_auc:.2f}")
print(f"ROC AUC: {mlp_roc_auc:.5f}")


MLP (Neural Network) метрики:
ROC AUC: 0.60
ROC AUC: 0.60250


In [None]:
rf_model = RandomForestClassifier(
    n_estimators=15,
    criterion='entropy',
    max_depth=8,
    random_state=42,
    min_samples_split=10,
    class_weight='balanced_subsample'
)

rf_model.fit(X_train, y_train)

# Прогноз на тестовых данных
rf_pred = rf_model.predict(X_test)
rf_pred_prob = rf_model.predict_proba(X_test)[:, 1]

# Расчет метрик для Random Forest
rf_roc_auc = roc_auc_score(y_test, rf_pred_prob)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)

print("Random Forest метрики:")
print(f"ROC AUC: {rf_roc_auc:.2f}")
print(f"Accuracy: {rf_accuracy:.2f}")
print(f"Precision: {rf_precision:.2f}")
print(f"Recall: {rf_recall:.2f}")

# Обучение Gradient Boosting
gb_model = GradientBoostingClassifier(
    learning_rate= 0.005,
    n_estimators= 10,
    min_weight_fraction_leaf= 0.077,
    max_depth= 2, max_features= 9,
    subsample= 0.8,
    min_samples_split= 3
)
gb_model.fit(X_train, y_train)

# Прогноз на тестовых данных
gb_pred = gb_model.predict(X_test)
gb_pred_prob = gb_model.predict_proba(X_test)[:, 1]


# Расчет метрик для Gradient Boosting
gb_roc_auc = roc_auc_score(y_test, gb_pred_prob)
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_precision = precision_score(y_test, gb_pred)
gb_recall = recall_score(y_test, gb_pred)

print("\nGradient Boosting метрики:")
print(f"ROC AUC: {gb_roc_auc:.2f}")
print(f"Accuracy: {gb_accuracy:.2f}")
print(f"Precision: {gb_precision:.2f}")
print(f"Recall: {gb_recall:.2f}")

Random Forest метрики:
ROC AUC: 0.77
Accuracy: 0.74
Precision: 0.81
Recall: 0.82

Gradient Boosting метрики:
ROC AUC: 0.79
Accuracy: 0.69
Precision: 0.69
Recall: 1.00


In [None]:
# Обучение MLP (Multi-Layer Perceptron) нейронной сети
mlp_model = MLPClassifier(hidden_layer_sizes=(30,), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

# Прогноз на тестовых данных
mlp_pred = mlp_model.predict(X_test)
mlp_pred_prob = mlp_model.predict_proba(X_test)[:, 1]

# Расчет метрик для MLP нейронной сети
mlp_roc_auc = roc_auc_score(y_test, mlp_pred_prob)
mlp_accuracy = accuracy_score(y_test, mlp_pred)
mlp_precision = precision_score(y_test, mlp_pred)
mlp_recall = recall_score(y_test, mlp_pred)

print("\nMLP (Neural Network) метрики:")
print(f"ROC AUC: {mlp_roc_auc:.2f}")
print(f"Accuracy: {mlp_accuracy:.2f}")
print(f"Precision: {mlp_precision:.2f}")
print(f"Recall: {mlp_recall:.2f}")


MLP (Neural Network) метрики:
ROC AUC: 0.60
Accuracy: 0.65
Precision: 0.72
Recall: 0.79


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Загружаем данные
data = pd.read_csv('german.csv', sep=';')
X = data.iloc[:, 1:].to_numpy()
y = data.iloc[:, 0].to_numpy()

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение Random Forest с оптимальными параметрами
rf_model = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=20,
    random_state=48,
    min_samples_split=21,
    class_weight='balanced_subsample'
)
rf_model.fit(X_train, y_train)

# Прогноз на тестовых данных для Random Forest
rf_pred_prob = rf_model.predict_proba(X_test)[:, 1]
rf_roc_auc = roc_auc_score(y_test, rf_pred_prob)

print("Random Forest метрики:")
print(f"ROC AUC: {rf_roc_auc:.5f}")

# Обучение Gradient Boosting с оптимальными параметрами
gb_model = GradientBoostingClassifier(
    criterion='squared_error',
    learning_rate=0.01,
    n_estimators=250,
    max_depth=2,
    max_features=9,
    min_weight_fraction_leaf=0.135,
    subsample=0.8,
    random_state=42
)
gb_model.fit(X_train, y_train)

# Прогноз на тестовых данных для Gradient Boosting
gb_pred_prob = gb_model.predict_proba(X_test)[:, 1]
gb_roc_auc = roc_auc_score(y_test, gb_pred_prob)

print("\nGradient Boosting метрики:")
print(f"ROC AUC: {gb_roc_auc:.5f}")

# Обучение MLP нейронной сети с оптимальными параметрами
mlp_model = MLPClassifier(
    solver='lbfgs',
    alpha=1e-07,
    hidden_layer_sizes=(19,),
    max_iter=800,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Прогноз на тестовых данных для MLP нейронной сети
mlp_pred_prob = mlp_model.predict_proba(X_test)[:, 1]
mlp_roc_auc = roc_auc_score(y_test, mlp_pred_prob)

print("\nMLP (Neural Network) метрики:")
print(f"ROC AUC: {mlp_roc_auc:.5f}")

# Ансамблирование с Voting Classifier (мягкое голосование для работы с вероятностями)
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('mlp', mlp_model)
    ],
    voting='soft'
)
voting_model.fit(X_train, y_train)

# Прогноз на тестовых данных для ансамбля Voting Classifier
voting_pred_prob = voting_model.predict_proba(X_test)[:, 1]
voting_roc_auc = roc_auc_score(y_test, voting_pred_prob)
voting_accuracy = accuracy_score(y_test, voting_model.predict(X_test))
voting_precision = precision_score(y_test, voting_model.predict(X_test))
voting_recall = recall_score(y_test, voting_model.predict(X_test))

print("\nАнсамбль (Voting Classifier) метрики:")
print(f"ROC AUC: {voting_roc_auc:.5f}")
print(f"Accuracy: {voting_accuracy:.2f}")
print(f"Precision: {voting_precision:.2f}")
print(f"Recall: {voting_recall:.2f}")


Random Forest метрики:
ROC AUC: 0.79173

Gradient Boosting метрики:
ROC AUC: 0.78226

MLP (Neural Network) метрики:
ROC AUC: 0.77151

Ансамбль (Voting Classifier) метрики:
ROC AUC: 0.79289
Accuracy: 0.77
Precision: 0.79
Recall: 0.90


In [None]:
# Обучение Random Forest
rf_model = RandomForestClassifier(criterion='entropy', class_weight='balanced_subsample', max_depth=4, n_estimators=11,random_state=42)
rf_model.fit(X_train, y_train)

# Прогноз на тестовых данных
rf_pred = rf_model.predict(X_test)
rf_pred_prob = rf_model.predict_proba(X_test)[:, 1]

# Расчет метрик для Random Forest
rf_roc_auc = roc_auc_score(y_test, rf_pred_prob)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)

print("Random Forest метрики:")
print(f"ROC AUC: {rf_roc_auc:.2f}")
print(f"Accuracy: {rf_accuracy:.2f}")
print(f"Precision: {rf_precision:.2f}")
print(f"Recall: {rf_recall:.2f}")

# Обучение Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=40, learning_rate=0.2,min_weight_fraction_leaf=0.135,max_depth=2, max_features=9, subsample=0.8, random_state=42)
gb_model.fit(X_train, y_train)

# Прогноз на тестовых данных
gb_pred = gb_model.predict(X_test)
gb_pred_prob = gb_model.predict_proba(X_test)[:, 1]

# Расчет метрик для Gradient Boosting
gb_roc_auc = roc_auc_score(y_test, gb_pred_prob)
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_precision = precision_score(y_test, gb_pred)
gb_recall = recall_score(y_test, gb_pred)

print("\nGradient Boosting метрики:")
print(f"ROC AUC: {gb_roc_auc:.2f}")
print(f"Accuracy: {gb_accuracy:.2f}")
print(f"Precision: {gb_precision:.2f}")
print(f"Recall: {gb_recall:.2f}")

Random Forest метрики:
ROC AUC: 0.82
Accuracy: 0.79
Precision: 0.89
Recall: 0.79

Gradient Boosting метрики:
ROC AUC: 0.81
Accuracy: 0.77
Precision: 0.79
Recall: 0.91
