In [71]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold

from imblearn.combine import SMOTEENN
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

## Считываем данные и разбиваем их на тренировочную и тестовую выборки

In [72]:
with open('data_for_models.csv') as f:
    data = pd.read_csv(f)

In [73]:
numeric_cols = data.select_dtypes(include='number').columns.tolist() # оставляю только числовые столбцы

data = data.astype({
    col: 'int64' for col in numeric_cols 
})


In [74]:
random_state = 2025 # нужен для повторяемости результатов

pos_data = data.query("target == 1")

# оставляем только 33% случайных сэмплов класса 0 для каждого диска
neg_data = (
    data
    .query("target == 0")
    .groupby("serial_number", as_index=False)
    .apply(lambda x: x.sample(frac=0.33, random_state=random_state))
    .reset_index(drop=True)
)

# новый датасет
data = pd.concat((neg_data, pos_data), axis=0)

  .apply(lambda x: x.sample(frac=0.33, random_state=random_state))


In [75]:
data = data.drop(columns = ['serial_number', 'date'])

In [76]:
data

Unnamed: 0,smart_5_last,smart_9_last,smart_187_last,smart_188_last,smart_192_last,smart_198_last,smart_199_last,smart_240_last,smart_241_last,smart_242_last,block,smart_5_diff,smart_187_diff,smart_198_diff,smart_199_max,AUC_smart_5_raw,AUC_smart_187_raw,AUC_smart_198_raw,target
0,0,16507,0,0,3,0,0,15948,98121318264,234810359494,23,0,0,0,0,0,0,0,0
1,0,13631,0,0,3,0,0,13097,88405181240,194441877008,19,0,0,0,0,0,0,0,0
2,0,6429,0,0,3,0,0,5939,62977305968,106747494514,9,0,0,0,0,0,0,0,0
3,0,3553,0,0,0,0,0,3110,43138957032,69458441764,5,0,0,0,0,0,0,0,0
4,0,7147,0,0,3,0,0,6653,66249905800,115393592954,10,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218623,0,15943,820,4295032833,16,0,0,15527,87198911944,233614901482,22,0,0,0,0,0,23780,0,1
219435,0,13306,0,0,3,0,0,13086,83943199456,204123480846,19,0,0,0,0,0,0,0,1
225958,0,7243,18,0,0,96,0,6840,58561660336,88530467778,10,0,10,32,0,0,327,2160,1
226309,0,6186,0,0,0,0,0,5797,55168531248,74917161090,9,0,0,0,0,0,0,0,1


In [77]:
# Т.к. нам нужно, чтобы в тестовой выборки с реальными данными не осталось 2-3 экземпляра класса 1 - проведём следующее разделение

# Разделяем данные класса 1 и класса 0
class_1 = data[data['target'] == 1]
class_0 = data[data['target'] == 0]

# Фиксируем 30 объектов класса 1 для тестовой выборки (можно попробовать взять меньше - больше)
class_1_train, class_1_test = train_test_split(class_1, test_size=30, random_state=random_state)

# Добавляем пропорциональное количество данных класса 0
class_0_train, class_0_test = train_test_split(class_0, test_size=len(class_1_test) * 10, random_state=random_state)

# Собираем тренировочные и тестовые выборки
train_data = pd.concat([class_1_train, class_0_train])
test_data = pd.concat([class_1_test, class_0_test])

# Перемешиваем данные
train_data = train_data.sample(frac=1, random_state=random_state).reset_index()
test_data = test_data.sample(frac=1, random_state=random_state).reset_index()

# Разделяем на X и y для обучения и теста
X_train, y_train = train_data.drop(columns=['target']), train_data['target']
X_test, y_test = test_data.drop(columns=['target']), test_data['target']

## Отдельное обучение одной модели

In [None]:
# Определение модели и кросс-валидации
clf = RandomForestClassifier(random_state=random_state)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Заводим массив для метрик кросс-валидации
roc_auc_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Кросс-валидация с балансировкой данных с помощью SMOTEENN
for train_idx, val_idx in tqdm(cv.split(X_train, y_train), total=cv.get_n_splits(), desc="Cross-validation"):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    smote_enn = SMOTEENN(random_state=random_state)
    X_resampled, y_resampled = smote_enn.fit_resample(X_fold_train, y_fold_train)
    
    # Обучение модели на синтетических данных
    clf.fit(X_resampled, y_resampled)
    
    # Оценка на валидационной выборке (реальные данные)
    y_val_pred = clf.predict(X_fold_val)
    y_val_pred_proba = clf.predict_proba(X_fold_val)[:, 1]
    
    # Расчёт метрик
    roc_auc = roc_auc_score(y_fold_val, y_val_pred_proba)
    precision = precision_score(y_fold_val, y_val_pred)
    recall = recall_score(y_fold_val, y_val_pred)
    f1 = f1_score(y_fold_val, y_val_pred)
    
    roc_auc_scores.append(roc_auc)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Вывод метрик кросс-валидации
print(f"Средний ROC-AUC: {np.mean(roc_auc_scores):.4f} ± {np.std(roc_auc_scores):.4f}")
print(f"Средний Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Средний Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"Средний F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

# Финальное тестирование на реальных данных
smote_enn = SMOTEENN(random_state=random_state)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

clf.fit(X_resampled, y_resampled)

y_test_pred = clf.predict(X_test)
y_test_pred_proba = clf.predict_proba(X_test)[:, 1]

print("\nКлассификационный отчет на тестовых данных (реальные данные):")
print(classification_report(y_test, y_test_pred, digits=4))

print(f"ROC-AUC на тестовых данных: {roc_auc_score(y_test, y_test_pred_proba):.4f}")
print(f"Precision на тестовых данных: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall на тестовых данных: {recall_score(y_test, y_test_pred):.4f}")
print(f"F1-Score на тестовых данных: {f1_score(y_test, y_test_pred):.4f}")

## Обучение сразу нескольких моделей

In [78]:
# Определение моделей
models = {
    "Logistic Regression": LogisticRegression(random_state=random_state, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=random_state),
    "Extra Trees": ExtraTreesClassifier(random_state=random_state),
    "LightGBM": LGBMClassifier(random_state=random_state),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=random_state),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=random_state)
}

# Определение кросс-валидации
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Массив для хранения результатов метрик по каждой моделей
results = []

# Обучение для каждой модели
for model_name, model in models.items():
    print(f"\nОбучение модели: {model_name}")
    roc_auc_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    # Кросс-валидация с балансировкой данных с помощью SMOTEENN
    for train_idx, val_idx in tqdm(cv.split(X_train, y_train), total=cv.get_n_splits(), desc=f"{model_name} CV"):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        smote_enn = SMOTEENN(random_state=random_state)
        X_resampled, y_resampled = smote_enn.fit_resample(X_fold_train, y_fold_train)

        # Обучение модели на синтетических данных
        model.fit(X_resampled, y_resampled)

        # Оценка на валидационной выборке (реальные данные)
        y_val_pred = model.predict(X_fold_val)
        y_val_pred_proba = model.predict_proba(X_fold_val)[:, 1]

        roc_auc = roc_auc_score(y_fold_val, y_val_pred_proba)
        precision = precision_score(y_fold_val, y_val_pred)
        recall = recall_score(y_fold_val, y_val_pred)
        f1 = f1_score(y_fold_val, y_val_pred)

        roc_auc_scores.append(roc_auc)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Сохранение средних значений метрик
    results.append({
        "Model": model_name,
        "ROC-AUC (CV)": f"{np.mean(roc_auc_scores):.4f} ± {np.std(roc_auc_scores):.4f}",
        "Precision (CV)": f"{np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}",
        "Recall (CV)": f"{np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}",
        "F1-Score (CV)": f"{np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}"
    })

    # Финальное тестирование на реальных данных
    smote_enn = SMOTEENN(random_state=random_state)
    X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
    model.fit(X_resampled, y_resampled)

    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]

    test_metrics = {
        "ROC-AUC (Test)": roc_auc_score(y_test, y_test_pred_proba),
        "Precision (Test)": precision_score(y_test, y_test_pred),
        "Recall (Test)": recall_score(y_test, y_test_pred),
        "F1-Score (Test)": f1_score(y_test, y_test_pred)
    }

    # Добавление тестовых метрик в финальные результаты
    results[-1].update({key: f"{value:.4f}" for key, value in test_metrics.items()})

# Вывод таблицы результатов
results_df = pd.DataFrame(results)

print("\nРезультаты моделей:")
results_df


Обучение модели: Logistic Regression


Logistic Regression CV:   0%|          | 0/5 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
results_df.to_csv("results/models_results_resampled.csv", index=False)

## Обучение с помощью балансировки весов классов (class_weight)

In [81]:
# Определение моделей
models = {
    "Logistic Regression": LogisticRegression(random_state=random_state, max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(random_state=random_state, class_weight="balanced"),
    "Extra Trees": ExtraTreesClassifier(random_state=random_state, class_weight="balanced"),
    "LightGBM": LGBMClassifier(random_state=random_state, class_weight="balanced"),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=random_state, auto_class_weights="Balanced"),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=random_state,
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    ),
}

# Определение кросс-валидации
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Массив для хранения результатов метрик по каждой моделей
results = []

# Обучение для каждой модели
for model_name, model in models.items():
    print(f"\nОбучение модели: {model_name}")
    roc_auc_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    # Кросс-валидация
    for train_idx, val_idx in tqdm(cv.split(X_train, y_train), total=cv.get_n_splits(), desc=f"{model_name} CV"):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Обучение на реальных данных
        model.fit(X_fold_train, y_fold_train)

        # Оценка на валидационной выборке (реальные данные)
        y_val_pred = model.predict(X_fold_val)
        y_val_pred_proba = model.predict_proba(X_fold_val)[:, 1]

        roc_auc = roc_auc_score(y_fold_val, y_val_pred_proba)
        precision = precision_score(y_fold_val, y_val_pred)
        recall = recall_score(y_fold_val, y_val_pred)
        f1 = f1_score(y_fold_val, y_val_pred)

        roc_auc_scores.append(roc_auc)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Сохранение средних значений метрик
    results.append({
        "Model": model_name,
        "ROC-AUC (CV)": f"{np.mean(roc_auc_scores):.4f} ± {np.std(roc_auc_scores):.4f}",
        "Precision (CV)": f"{np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}",
        "Recall (CV)": f"{np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}",
        "F1-Score (CV)": f"{np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}",
    })

    # Финальное тестирование на реальных данных
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]

    test_metrics = {
        "ROC-AUC (Test)": roc_auc_score(y_test, y_test_pred_proba),
        "Precision (Test)": precision_score(y_test, y_test_pred),
        "Recall (Test)": recall_score(y_test, y_test_pred),
        "F1-Score (Test)": f1_score(y_test, y_test_pred),
    }

    # Добавление тестовых метрик в финальные результаты
    results[-1].update({key: f"{value:.4f}" for key, value in test_metrics.items()})

# Вывод таблицы результатов
results_df = pd.DataFrame(results)

print("\nРезультаты моделей:")
results_df


Обучение модели: Logistic Regression


Logistic Regression CV: 100%|██████████| 5/5 [00:02<00:00,  2.37it/s]



Обучение модели: Random Forest


Random Forest CV: 100%|██████████| 5/5 [00:25<00:00,  5.20s/it]



Обучение модели: Extra Trees


Extra Trees CV: 100%|██████████| 5/5 [00:12<00:00,  2.41s/it]



Обучение модели: LightGBM


LightGBM CV:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 111, number of negative: 63448
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2095
[LightGBM] [Info] Number of data points in the train set: 63559, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


LightGBM CV:  20%|██        | 1/5 [00:00<00:01,  3.71it/s]

[LightGBM] [Info] Number of positive: 110, number of negative: 63449
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2094
[LightGBM] [Info] Number of data points in the train set: 63559, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


LightGBM CV:  40%|████      | 2/5 [00:00<00:00,  3.37it/s]

[LightGBM] [Info] Number of positive: 110, number of negative: 63449
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2101
[LightGBM] [Info] Number of data points in the train set: 63559, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


LightGBM CV:  80%|████████  | 4/5 [00:01<00:00,  3.61it/s]

[LightGBM] [Info] Number of positive: 110, number of negative: 63449
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2096
[LightGBM] [Info] Number of data points in the train set: 63559, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 111, number of negative: 63449
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2097
[LightGBM] [Info] Number of data points in the train set: 63560, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightG

LightGBM CV: 100%|██████████| 5/5 [00:01<00:00,  3.62it/s]


[LightGBM] [Info] Number of positive: 138, number of negative: 79311
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001922 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2158
[LightGBM] [Info] Number of data points in the train set: 79449, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

Обучение модели: CatBoost


CatBoost CV: 100%|██████████| 5/5 [01:09<00:00, 13.82s/it]



Обучение модели: XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

XGBoost CV: 100%|██████████| 5/5 [00:01<00:00,  3.57it/s]
Parameters: { "use_label_encoder" } are not used.




Результаты моделей:


Unnamed: 0,Model,ROC-AUC (CV),Precision (CV),Recall (CV),F1-Score (CV),ROC-AUC (Test),Precision (Test),Recall (Test),F1-Score (Test)
0,Logistic Regression,0.9601 ± 0.0195,0.0119 ± 0.0029,0.9206 ± 0.0346,0.0235 ± 0.0056,0.8013,0.3235,0.7333,0.449
1,Random Forest,0.9816 ± 0.0277,1.0000 ± 0.0000,0.8767 ± 0.0495,0.9336 ± 0.0283,0.9492,1.0,0.7667,0.8679
2,Extra Trees,0.9851 ± 0.0136,1.0000 ± 0.0000,0.7820 ± 0.0633,0.8762 ± 0.0412,0.9492,1.0,0.6333,0.7755
3,LightGBM,0.9505 ± 0.0369,0.9546 ± 0.0428,0.8913 ± 0.0452,0.9214 ± 0.0393,0.9857,1.0,0.8,0.8889
4,CatBoost,0.9809 ± 0.0083,0.7758 ± 0.0961,0.8913 ± 0.0452,0.8264 ± 0.0610,0.9266,1.0,0.8,0.8889
5,XGBoost,0.9761 ± 0.0201,0.9623 ± 0.0555,0.8556 ± 0.0543,0.9046 ± 0.0461,0.9428,1.0,0.8,0.8889


In [59]:
results_df.to_csv("results/models_results_weighted.csv", index=False)

## Выводим результаты

In [None]:
with open('results/models_results_resampled.csv') as f:
    res = pd.read_csv(f)

res

In [None]:
with open('results/models_results_weighted.csv') as f:
    res = pd.read_csv(f)

res