In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import optuna
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    balanced_accuracy_score
)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import sklearn
import mlflow
import matplotlib.ticker as ticker
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
import time
import joblib
from tqdm.notebook import tqdm
import numpy as np 
from scipy import interpolate
import warnings
import boto3
from dotenv import load_dotenv  
load_dotenv()

mlflow.set_tracking_uri("http://84.201.144.227:8000")
warnings.filterwarnings("ignore")

pd.options.display.max_columns = None
sklearn.set_config(transform_output='pandas')

  import pkg_resources  # noqa: TID251


123

In [2]:
df = pd.read_csv('data/student.csv').drop(columns=['number', 'Id'])
df['Attendance'] = df['Attendance'].map({"Always": 3, "Sometimes": 2, "Never": 1, "3": None})
df['Scholarship'] = df['Scholarship'].fillna("0%").str.replace("%", "").astype(int)

grade_mapping = {"Fail": 0, "FD": 1, "DD": 2, "DC": 3, "CC": 4, "CB": 5, "BB": 6, "BA": 7, "AA": 8}
df['Grade'] = df['Grade'].map(grade_mapping)

X = df.drop(columns=['Grade'])
y = df['Grade']

le = LabelEncoder()
y_encoded = pd.Series(le.fit_transform(y))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [4]:
X_test.to_csv("valid_X_test.csv", index=False)

In [5]:
X_test.to_csv("X_test.csv")
y_test.to_csv("y_test.csv")

In [7]:
ohe_pipe = Pipeline([
    ("SimpleImputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("OneHotEncoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

ord_pipe = Pipeline([
    ("SimpleImputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("OrdinalEncoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

num_pipe = Pipeline([
    ("PowerTransformer", PowerTransformer())
])

ohe_list = ['Sex', 'High_School_Type', 'Transportation']
ord_list = X.select_dtypes(include="object").columns.drop(ohe_list)
num_list = X.select_dtypes(exclude="object").columns

In [8]:
transform = ColumnTransformer([
    ("ord_pipe", ord_pipe, ord_list),
    ("num_pipe", num_pipe, num_list),
    ("ohe_pipe", ohe_pipe, ohe_list)
])


In [9]:
loo = LeaveOneOut()

In [10]:
def objective_xgb(trial):
    params = {
        "model__estimator__n_estimators": trial.suggest_categorical("model__estimator__n_estimators", [1000]),
        "model__estimator__max_depth": trial.suggest_int("model__estimator__max_depth", 6, 10),
        "model__estimator__learning_rate": trial.suggest_float("model__estimator__learning_rate", 0.001, 0.05),
        "model__estimator__subsample": trial.suggest_float("model__estimator__subsample", 0.5, 1.0),
        "model__estimator__colsample_bytree": trial.suggest_float("model__estimator__colsample_bytree", 0.1, 0.6),
        "model__estimator__gamma": trial.suggest_float("model__estimator__gamma", 0.0, 1.0),
        "model__estimator__reg_alpha": trial.suggest_float("model__estimator__reg_alpha", 0.0, 1.0),
    }
    pipe = Pipeline([
        ("transformation", transform),
        ("model", OneVsRestClassifier(XGBClassifier()))
    ])
    pipe.set_params(**params)


    f1_scores = []

    for X_train_index, X_test_index in tqdm(loo.split(X_train), total=len(X_train)):
        X_train_loo, X_test_loo = X_train.iloc[X_train_index], X_train.iloc[X_test_index]
        y_train_loo, y_test_loo = y_train.iloc[X_train_index], y_train.iloc[X_test_index]
        pipe.fit(X_train_loo, y_train_loo)
        y_pred=pipe.predict(X_test_loo)
        f1 = f1_score(y_test_loo, y_pred, average='weighted', zero_division=0)
        f1_scores.append(f1)
    return sum(f1_scores)/ len(f1_scores)

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=1)
best_xgb = study_xgb.best_params
print(f"Лучшие параметры XGBoost: {best_xgb}")

[I 2025-07-12 22:19:55,843] A new study created in memory with name: no-name-3dbc2a90-56d7-400a-bb15-9e4d1a7edd97


  0%|          | 0/116 [00:00<?, ?it/s]

[I 2025-07-12 22:24:44,029] Trial 0 finished with value: 0.19827586206896552 and parameters: {'model__estimator__n_estimators': 1000, 'model__estimator__max_depth': 8, 'model__estimator__learning_rate': 0.01919062588884188, 'model__estimator__subsample': 0.5553779148569076, 'model__estimator__colsample_bytree': 0.37215765007869583, 'model__estimator__gamma': 0.15523299260114232, 'model__estimator__reg_alpha': 0.0980949442350838}. Best is trial 0 with value: 0.19827586206896552.


Лучшие параметры XGBoost: {'model__estimator__n_estimators': 1000, 'model__estimator__max_depth': 8, 'model__estimator__learning_rate': 0.01919062588884188, 'model__estimator__subsample': 0.5553779148569076, 'model__estimator__colsample_bytree': 0.37215765007869583, 'model__estimator__gamma': 0.15523299260114232, 'model__estimator__reg_alpha': 0.0980949442350838}


In [11]:
def report_metrics(mlflow_obj, y_true, y_prediction, y_probabilities, suffix_str, class_labels):
    accuracy = accuracy_score(y_true, y_prediction)
    balanced_accuracy = balanced_accuracy_score(y_true, y_prediction)
    precision = precision_score(y_true, y_prediction, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_prediction, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_prediction, average='weighted', zero_division=0)
    
    cm = confusion_matrix(y_true, y_prediction)

    print(f"Accuracy_{suffix_str}: {accuracy}")
    print(f"Balanced Accuracy_{suffix_str}: {balanced_accuracy}")
    print(f"Precision (Weighted)_{suffix_str}: {precision}")
    print(f"Recall (Weighted)_{suffix_str}: {recall}")
    print(f"F1 Score (Weighted)_{suffix_str}: {f1}")
    
    mlflow_obj.log_metric(f"accuracy_{suffix_str}", accuracy)
    mlflow_obj.log_metric(f"balanced_accuracy_{suffix_str}", balanced_accuracy)
    mlflow_obj.log_metric(f"precision_weighted_{suffix_str}", precision)
    mlflow_obj.log_metric(f"recall_weighted_{suffix_str}", recall)
    mlflow_obj.log_metric(f"f1_weighted_{suffix_str}", f1)

    try:
        roc_auc_ovr_weighted = roc_auc_score(y_true, y_probabilities, multi_class='ovr', average='weighted')
        print(f"ROC AUC (OvR Weighted)_{suffix_str}: {roc_auc_ovr_weighted}")
        mlflow_obj.log_metric(f"roc_auc_ovr_weighted_{suffix_str}", roc_auc_ovr_weighted)

        roc_auc_ovr_macro = roc_auc_score(y_true, y_probabilities, multi_class='ovr', average='macro')
        print(f"ROC AUC (OvR Macro)_{suffix_str}: {roc_auc_ovr_macro}")
        mlflow_obj.log_metric(f"roc_auc_ovr_macro_{suffix_str}", roc_auc_ovr_macro)
    except ValueError as e:
        print(f"Не удалось рассчитать ROC AUC: {e}")
        mlflow_obj.log_metric(f"roc_auc_ovr_weighted_{suffix_str}", 0.0)
        mlflow_obj.log_metric(f"roc_auc_ovr_macro_{suffix_str}", 0.0)

    print(f"Confusion Matrix_{suffix_str}:\n", cm)
    mlflow_obj.log_text(f"Confusion Matrix (text):\n{cm}", f"confusion_matrix_text_{suffix_str}.txt")
    
    cm_for_plot = confusion_matrix(y_true, y_prediction, labels=range(len(class_labels)))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_for_plot, display_labels=class_labels)
    fig_cm_width = max(8, len(class_labels) * 0.9)
    fig_cm_height = max(6, len(class_labels) * 0.7)
    fig_cm, ax_cm = plt.subplots(figsize=(fig_cm_width, fig_cm_height))
    disp.plot(cmap=plt.cm.Blues, ax=ax_cm, xticks_rotation='vertical')
    ax_cm.set_title(f"Общая матрица ошибок ({suffix_str})")
    plt.tight_layout()
    mlflow_obj.log_figure(fig_cm, f"confusion_matrix_overall_{suffix_str}.png")
    plt.close(fig_cm)

    n_classes = y_probabilities.shape[1]
    y_true_binarized = label_binarize(y_true, classes=range(n_classes))
    fpr = dict()
    tpr = dict()
    roc_auc_individual = dict()
    fig_roc, ax_roc = plt.subplots(figsize=(12, 10))
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_binarized[:, i], y_probabilities[:, i])
        roc_auc_individual[i] = auc(fpr[i], tpr[i])
        ax_roc.plot(fpr[i], tpr[i], lw=2, label=f'ROC класс {class_labels[i]} (AUC = {roc_auc_individual[i]:.2f})')
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_binarized.ravel(), y_probabilities.ravel())
    roc_auc_individual["micro"] = auc(fpr["micro"], tpr["micro"])
    ax_roc.plot(fpr["micro"], tpr["micro"],
                label=f'Micro-средняя ROC (AUC = {roc_auc_individual["micro"]:.2f})',
                color='deeppink', linestyle=':', linewidth=4)
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc_individual["macro"] = auc(fpr["macro"], tpr["macro"])
    ax_roc.plot(fpr["macro"], tpr["macro"],
                label=f'Macro-средняя ROC (AUC = {roc_auc_individual["macro"]:.2f})',
                color='navy', linestyle=':', linewidth=4)
    ax_roc.plot([0, 1], [0, 1], linestyle='--', lw=2, color='gray', label='Случайное предсказание')
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC-кривые One-vs-Rest с усреднением ({suffix_str})')
    ax_roc.legend(loc="lower right")
    ax_roc.grid(True)
    plt.tight_layout()
    mlflow_obj.log_figure(fig_roc, f"roc_curves_ovr_averages_{suffix_str}.png")
    plt.close(fig_roc)

def mlflow_run(model_type_name, pipeline_obj, params_dict, X_data_test, y_data_test, fitted_label_encoder):
    mlflow.log_params(params_dict)
    predict_start_time = time.time()
    y_predicted_values = pipeline_obj.predict(X_data_test)
    y_probability_values = pipeline_obj.predict_proba(X_data_test)
    predict_end_time = time.time()
    prediction_duration = predict_end_time - predict_start_time
    print(f"Время предсказания на тестовом наборе ({model_type_name}): {prediction_duration:.4f} секунд")
    mlflow.log_metric("prediction_time_test_seconds", prediction_duration)
    report_metrics(mlflow, y_data_test, y_predicted_values, y_probability_values, 
                   f"{model_type_name}_eval", class_labels=fitted_label_encoder.classes_)
    mlflow.sklearn.log_model(pipeline_obj, "model_pipeline")
    model_step_in_pipeline = pipeline_obj.named_steps['model']
    importances_values_array = None
    if hasattr(model_step_in_pipeline, 'estimators_'): 
        if all(hasattr(est, 'feature_importances_') for est in model_step_in_pipeline.estimators_):
            imp_sum_list = [est.feature_importances_ for est in model_step_in_pipeline.estimators_]
            importances_values_array = pd.DataFrame(imp_sum_list).mean(axis=0).values
    elif hasattr(model_step_in_pipeline, 'feature_importances_'): 
        importances_values_array = model_step_in_pipeline.feature_importances_
    if importances_values_array is not None:
        feature_names_list = pipeline_obj.named_steps['transformation'].get_feature_names_out()
        feat_imp_df = pd.DataFrame({
            'feature': feature_names_list,
            'importance': importances_values_array
        }).sort_values(by='importance', ascending=False)
        feat_imp_df = feat_imp_df[feat_imp_df['importance'] > 0].head(40)
        if not feat_imp_df.empty:
            total_importance_sum = feat_imp_df['importance'].sum()
            feat_imp_df['importance_fraction'] = feat_imp_df['importance'] / total_importance_sum if total_importance_sum > 0 else 0.0
            fig_fi_width = 25
            fig_fi_height = max(10, len(feat_imp_df) * 0.5) 
            fig_fi, ax_fi = plt.subplots(figsize=(fig_fi_width, fig_fi_height))
            bars = ax_fi.barh(feat_imp_df['feature'], feat_imp_df['importance_fraction'], color='skyblue')
            ax_fi.invert_yaxis()
            ax_fi.set_title(f"Важность признаков - {model_type_name}")
            ax_fi.set_xlabel("Важность (%)")
            ax_fi.set_ylabel("Признаки")
            ax_fi.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
            for bar_obj in bars:
                width_val = bar_obj.get_width()
                label_txt = f"{width_val:.1%}"
                ax_fi.text(width_val + 0.01, bar_obj.get_y() + bar_obj.get_height() / 2, label_txt, va='center')
            plt.tight_layout()
            mlflow.log_figure(fig_fi, f"feature_importances_{model_type_name}.png")
            plt.close(fig_fi)
            feat_imp_df.to_csv(f"feature_importances_{model_type_name}.csv", index=False)
            mlflow.log_artifact(f"feature_importances_{model_type_name}.csv")
        else:
            print(f"Не найдено признаков с важностью > 0 для {model_type_name}.")
    else:
        print(f"Не удалось получить важность признаков для {model_type_name}.")
    print(f"Запуск для {model_type_name} завершен. Все результаты залогированы в MLflow.")

In [12]:
pipe_xgb = Pipeline([
    ("transformation", transform),
    ("model", OneVsRestClassifier(XGBClassifier()))
])
if 'best_xgb' in locals() or 'best_xgb' in globals():
    pipe_xgb.set_params(**best_xgb)
else:
    print("Переменная best_xgb не найдена. Модель XGBoost будет использовать параметры по умолчанию.")

print("Начинаем обучение финальной модели XGBoost...")
train_start_time_xgb = time.time()
pipe_xgb.fit(X_train, y_train)
train_end_time_xgb = time.time()
final_training_time_xgb = train_end_time_xgb - train_start_time_xgb
print(f"Время обучения финальной модели XGBoost: {final_training_time_xgb:.4f} секунд")

Начинаем обучение финальной модели XGBoost...
Время обучения финальной модели XGBoost: 3.5567 секунд


In [13]:
mlflow.set_experiment("raw_data_analysis")
with mlflow.start_run(run_name="XGBoost_LOO_Optuna_Final"):
    mlflow.log_metric("final_model_training_time_seconds", final_training_time_xgb)
    params_to_log_xgb = best_xgb if 'best_xgb' in locals() or 'best_xgb' in globals() else {}
    mlflow_run("XGBoost", pipe_xgb, params_to_log_xgb, X_test, y_test, le)


Время предсказания на тестовом наборе (XGBoost): 0.0785 секунд
Accuracy_XGBoost_eval: 0.13793103448275862
Balanced Accuracy_XGBoost_eval: 0.08571428571428572
Precision (Weighted)_XGBoost_eval: 0.09865900383141762
Recall (Weighted)_XGBoost_eval: 0.13793103448275862
F1 Score (Weighted)_XGBoost_eval: 0.11362889983579638
ROC AUC (OvR Weighted)_XGBoost_eval: 0.5625767846457502
ROC AUC (OvR Macro)_XGBoost_eval: 0.5773931161431161
Confusion Matrix_XGBoost_eval:
 [[0 1 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 3]
 [0 1 0 1 0 0 1 0]
 [0 1 0 0 0 0 2 0]
 [0 0 0 0 0 0 1 1]
 [0 0 0 1 0 0 2 1]
 [1 0 0 1 0 0 2 1]
 [0 3 1 0 0 0 1 2]]
Запуск для XGBoost завершен. Все результаты залогированы в MLflow.


In [12]:
# logged_model = 'runs:/aa735d26384d4e20aa803b9cf235e64f/model_pipeline'

# loaded_model = mlflow.pyfunc.load_model(logged_model)

# loaded_model.predict(X_test)

In [13]:
# model_uri = 'runs:/38064bcda94e4c9e9ffefacb80375816/model_pipeline'

In [14]:
# import mlflow
# logged_model = 'runs:/38064bcda94e4c9e9ffefacb80375816/model_pipeline'

# loaded_model = mlflow.pyfunc.load_model(logged_model)

# import pandas as pd
# loaded_model.predict(pd.DataFrame(df))

In [15]:
def objective_lgbm(trial):
    params = {
        "model__estimator__n_estimators": trial.suggest_int("model__estimator__n_estimators", 100, 200),
        "model__estimator__max_depth": trial.suggest_int("model__estimator__max_depth", 6, 12),
        "model__estimator__learning_rate": trial.suggest_float("model__estimator__learning_rate", 0.01, 0.3),
        "model__estimator__num_leaves": trial.suggest_int("model__estimator__num_leaves", 50, 1000),
        "model__estimator__subsample": trial.suggest_float("model__estimator__subsample", 0.6, 1.0),
        "model__estimator__colsample_bytree": trial.suggest_float("model__estimator__colsample_bytree", 0.6, 1.0),
        "model__estimator__reg_alpha": trial.suggest_float("model__estimator__reg_alpha", 0.0, 1.0),
        "model__estimator__reg_lambda": trial.suggest_float("model__estimator__reg_lambda", 0.0, 1.0),
    }
    pipe = Pipeline([
        ("transformation", transform),
        ("model", OneVsRestClassifier(LGBMClassifier(random_state=42, verbose=-1)))])
    pipe.set_params(**params)

    f1_scores = []
    for X_train_index, X_test_index in tqdm(loo.split(X_train), total=len(X_train)):
        X_train_loo, X_test_loo = X_train.iloc[X_train_index], X_train.iloc[X_test_index]
        y_train_loo, y_test_loo = y_train.iloc[X_train_index], y_train.iloc[X_test_index]
        pipe.fit(X_train_loo, y_train_loo)
        y_pred=pipe.predict(X_test_loo)
        f1 = f1_score(y_test_loo, y_pred, average='weighted', zero_division=0)
        f1_scores.append(f1)
    return sum(f1_scores)/ len(f1_scores)

study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective_lgbm, n_trials=2)
best_lgbm = study_lgbm.best_params
print(f"Лучшие параметры LightGBM: {best_lgbm}")

[I 2025-07-05 18:01:29,695] A new study created in memory with name: no-name-5723bc59-cdeb-4dbc-80d4-8ca20625f637


  0%|          | 0/116 [00:00<?, ?it/s]

[I 2025-07-05 18:01:54,399] Trial 0 finished with value: 0.1810344827586207 and parameters: {'model__estimator__n_estimators': 120, 'model__estimator__max_depth': 8, 'model__estimator__learning_rate': 0.22489526269450463, 'model__estimator__num_leaves': 301, 'model__estimator__subsample': 0.7762744322949857, 'model__estimator__colsample_bytree': 0.6148306819842884, 'model__estimator__reg_alpha': 0.03389338861497715, 'model__estimator__reg_lambda': 0.9283311917025703}. Best is trial 0 with value: 0.1810344827586207.


  0%|          | 0/116 [00:00<?, ?it/s]

[I 2025-07-05 18:02:20,523] Trial 1 finished with value: 0.20689655172413793 and parameters: {'model__estimator__n_estimators': 181, 'model__estimator__max_depth': 12, 'model__estimator__learning_rate': 0.10771283412819901, 'model__estimator__num_leaves': 50, 'model__estimator__subsample': 0.7075364778592654, 'model__estimator__colsample_bytree': 0.9181113851771636, 'model__estimator__reg_alpha': 0.2147778090614838, 'model__estimator__reg_lambda': 0.8138733167481704}. Best is trial 1 with value: 0.20689655172413793.


Лучшие параметры LightGBM: {'model__estimator__n_estimators': 181, 'model__estimator__max_depth': 12, 'model__estimator__learning_rate': 0.10771283412819901, 'model__estimator__num_leaves': 50, 'model__estimator__subsample': 0.7075364778592654, 'model__estimator__colsample_bytree': 0.9181113851771636, 'model__estimator__reg_alpha': 0.2147778090614838, 'model__estimator__reg_lambda': 0.8138733167481704}


In [16]:
pipe_lgbm = Pipeline([
    ("transformation", transform),
    ("model", OneVsRestClassifier(LGBMClassifier(random_state=42)))
])
if 'best_lgbm' in locals() or 'best_lgbm' in globals():
    pipe_lgbm.set_params(**best_lgbm)
else:
    print("Переменная best_lgbm не найдена. Модель LightGBM будет использовать параметры по умолчанию.")

print("Начинаем обучение финальной модели LightGBM...")
train_start_time_lgbm = time.time()
pipe_lgbm.fit(X_train, y_train)
train_end_time_lgbm = time.time()
final_training_time_lgbm = train_end_time_lgbm - train_start_time_lgbm
print(f"Время обучения финальной модели LightGBM: {final_training_time_lgbm:.4f} секунд")

mlflow.set_experiment("raw_data_analysis")
with mlflow.start_run(run_name="LightGBM_LOO_Optuna_Final"):
    mlflow.log_metric("final_model_training_time_seconds", final_training_time_lgbm)
    params_to_log_lgbm = best_lgbm if 'best_lgbm' in locals() or 'best_lgbm' in globals() else {}
    mlflow_run("LightGBM", pipe_lgbm, params_to_log_lgbm, X_test, y_test, le)

Начинаем обучение финальной модели LightGBM...
Время обучения финальной модели LightGBM: 0.2506 секунд
Время предсказания на тестовом наборе (LightGBM): 0.0420 секунд
Accuracy_LightGBM_eval: 0.1724137931034483
Balanced Accuracy_LightGBM_eval: 0.10357142857142856
Precision (Weighted)_LightGBM_eval: 0.12988505747126436
Recall (Weighted)_LightGBM_eval: 0.1724137931034483
F1 Score (Weighted)_LightGBM_eval: 0.14788862253365298
ROC AUC (OvR Weighted)_LightGBM_eval: 0.5252262679848886
ROC AUC (OvR Macro)_LightGBM_eval: 0.5467082917082917
Confusion Matrix_LightGBM_eval:
 [[0 1 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 3]
 [0 1 0 1 0 0 1 0]
 [0 1 0 0 0 0 2 0]
 [0 0 1 0 0 0 0 1]
 [0 1 0 1 0 0 0 2]
 [1 0 0 0 0 1 2 1]
 [0 3 0 0 0 0 1 3]]




Запуск для LightGBM завершен. Все результаты залогированы в MLflow.
🏃 View run LightGBM_LOO_Optuna_Final at: http://84.201.144.227:8000/#/experiments/1/runs/9b893b5b81524dd3b233c424d53545ff
🧪 View experiment at: http://84.201.144.227:8000/#/experiments/1


In [17]:
mlflow.set_experiment("raw_data_analysis")
with mlflow.start_run(run_name="LightGBM_LOO_Optuna_Final"):
    mlflow.log_metric("final_model_training_time_seconds", final_training_time_lgbm)
    params_to_log_lgbm = best_lgbm if 'best_lgbm' in locals() or 'best_lgbm' in globals() else {}
    mlflow_run("LightGBM", pipe_lgbm, params_to_log_lgbm, X_test, y_test, le)

Время предсказания на тестовом наборе (LightGBM): 0.0460 секунд
Accuracy_LightGBM_eval: 0.1724137931034483
Balanced Accuracy_LightGBM_eval: 0.10357142857142856
Precision (Weighted)_LightGBM_eval: 0.12988505747126436
Recall (Weighted)_LightGBM_eval: 0.1724137931034483
F1 Score (Weighted)_LightGBM_eval: 0.14788862253365298
ROC AUC (OvR Weighted)_LightGBM_eval: 0.5252262679848886
ROC AUC (OvR Macro)_LightGBM_eval: 0.5467082917082917
Confusion Matrix_LightGBM_eval:
 [[0 1 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 3]
 [0 1 0 1 0 0 1 0]
 [0 1 0 0 0 0 2 0]
 [0 0 1 0 0 0 0 1]
 [0 1 0 1 0 0 0 2]
 [1 0 0 0 0 1 2 1]
 [0 3 0 0 0 0 1 3]]




Запуск для LightGBM завершен. Все результаты залогированы в MLflow.
🏃 View run LightGBM_LOO_Optuna_Final at: http://84.201.144.227:8000/#/experiments/1/runs/0ae2f19a6b864eb2b3be54eaa189a987
🧪 View experiment at: http://84.201.144.227:8000/#/experiments/1


In [18]:
def objective_cat(trial):
    params = {
        "model__estimator__iterations": trial.suggest_int("model__estimator__iterations", 300, 1000),
        "model__estimator__depth": trial.suggest_int("model__estimator__depth", 6, 10),
        "model__estimator__learning_rate": trial.suggest_float("model__estimator__learning_rate", 0.01, 0.3),
        "model__estimator__l2_leaf_reg": trial.suggest_float("model__estimator__l2_leaf_reg", 1.0, 10.0),
    }
    pipe = Pipeline([
        ("transformation", transform),
        ("model", OneVsRestClassifier(CatBoostClassifier(verbose=0)))
    ])
    pipe.set_params(**params)

    f1_scores = []
    for X_train_index, X_test_index in tqdm(loo.split(X_train), total=len(X_train)):
        X_train_loo, X_test_loo = X_train.iloc[X_train_index], X_train.iloc[X_test_index]
        y_train_loo, y_test_loo = y_train.iloc[X_train_index], y_train.iloc[X_test_index]
        pipe.fit(X_train_loo, y_train_loo)
        y_pred=pipe.predict(X_test_loo)
        f1 = f1_score(y_test_loo, y_pred, average='weighted', zero_division=0)
        f1_scores.append(f1)
    return sum(f1_scores)/ len(f1_scores)

study_cat = optuna.create_study(direction="maximize")
study_cat.optimize(objective_cat, n_trials=2)
best_cat = study_cat.best_params
print(f"Лучшие параметры CatBoost: {best_cat}")

[I 2025-07-05 18:02:37,315] A new study created in memory with name: no-name-d957e0f4-c4a1-4502-9d99-56174134b2c0


  0%|          | 0/116 [00:00<?, ?it/s]

[W 2025-07-05 18:07:32,223] Trial 0 failed with parameters: {'model__estimator__iterations': 728, 'model__estimator__depth': 10, 'model__estimator__learning_rate': 0.21512682435436242, 'model__estimator__l2_leaf_reg': 6.210684702203773} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\arutt\Desktop\ProjectClass\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\arutt\AppData\Local\Temp\ipykernel_3396\3892305994.py", line 18, in objective_cat
    pipe.fit(X_train_loo, y_train_loo)
  File "c:\Users\arutt\Desktop\ProjectClass\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\arutt\Desktop\ProjectClass\.venv\Lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt

KeyboardInterrupt: 

In [None]:
pipe_cat = Pipeline([
    ("transformation", transform),
    ("model", OneVsRestClassifier(CatBoostClassifier(verbose=0)))
])
if 'best_cat' in locals() or 'best_cat' in globals():
    pipe_cat.set_params(**best_cat)
else:
    print("Переменная best_cat не найдена. Модель CatBoost будет использовать параметры по умолчанию.")

print("Начинаем обучение финальной модели CatBoost...")
train_start_time_cat = time.time()
pipe_cat.fit(X_train, y_train)
train_end_time_cat = time.time()
final_training_time_cat = train_end_time_cat - train_start_time_cat
print(f"Время обучения финальной модели CatBoost: {final_training_time_cat:.4f} секунд")

mlflow.set_experiment("raw_data_analysis")
with mlflow.start_run(run_name="CatBoost_LOO_Optuna_Final"):
    mlflow.log_metric("final_model_training_time_seconds", final_training_time_cat)
    params_to_log_cat = best_cat if 'best_cat' in locals() or 'best_cat' in globals() else {}
    mlflow_run("CatBoost", pipe_cat, params_to_log_cat, X_test, y_test, le)

In [None]:
mlflow.set_experiment("raw_data_analysis")
with mlflow.start_run(run_name="CatBoost_LOO_Optuna_Final"):
    mlflow.log_metric("final_model_training_time_seconds", final_training_time_cat)
    params_to_log_cat = best_cat if 'best_cat' in locals() or 'best_cat' in globals() else {}
    mlflow_run("CatBoost", pipe_cat, params_to_log_cat, X_test, y_test, le)

In [None]:
models = {
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_cat
}

In [None]:
def calculate_metrics(conf_matrix, class_idx, y_true=None, y_scores=None, beta=0.5):
    tp = conf_matrix[class_idx, class_idx]
    fn = conf_matrix[class_idx, :].sum() - tp
    fp = conf_matrix[:, class_idx].sum() - tp
    tn = conf_matrix.sum() - (tp + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
    f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0
    fbeta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall) if (precision + recall) > 0 else 0
    balanced_acc = (recall + specificity) / 2
    metrics = {
        "Accuracy": accuracy,
        "Confusion Matrix": [[int(tp), int(fp)], [int(fn), int(tn)]],
        "Precision": precision,
        "Recall (Sensitivity)": recall,
        "Specificity": specificity,
        "FPR": fpr,
        "F1 Score": f1,
        f"F{int(beta)} Score": fbeta,
        "Balanced Accuracy": balanced_acc,
    }
    if y_true is not None and y_scores is not None:
        try:
            metrics["ROC-AUC"] = roc_auc_score(y_true, y_scores)
            fpr_arr, tpr_arr, thresholds = roc_curve(y_true, y_scores)
            metrics["ROC Curve"] = {
                "FPR": fpr_arr.tolist(),
                "TPR": tpr_arr.tolist(),
                "Thresholds": thresholds.tolist()
            }
        except Exception as e:
            metrics["ROC-AUC"] = None
            metrics["ROC Curve"] = {"error": str(e)}
    return metrics

In [None]:
results = {}

for model_name, model in models.items():
    print(f"Model: {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    conf_mat = confusion_matrix(y_test, y_pred)
    print("Overall Confusion Matrix:")
    print(conf_mat)

    for class_idx in range(len(le.classes_)):
        print(f"Class {le.inverse_transform([class_idx])[0]} (label {class_idx}):")

        y_test_bin = (y_test == class_idx).astype(int)
        y_pred_bin = (y_pred == class_idx).astype(int)

        try:
            if hasattr(model.named_steps["model"], "predict_proba"):
                y_scores = model.predict_proba(X_test)[:, class_idx]
            else:
                y_scores = y_pred_bin
        except:
            y_scores = y_pred_bin

        metrics = calculate_metrics(conf_mat, class_idx, y_true=y_test_bin, y_scores=y_scores)
        results[(model_name, class_idx)] = metrics

        for k, v in metrics.items():
            if isinstance(v, dict):
                continue
            print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

In [None]:
for model_name, model in models.items():
    print(f"Plotting One-vs-Rest ROC curves for model: {model_name}")
    model.fit(X_train, y_train)
    
    if not hasattr(model.named_steps["model"], "predict_proba"):
        print(f"Model {model_name} does not support probability predictions. Skipping ROC plots.")
        continue

    y_proba = model.predict_proba(X_test)
    n_classes = len(le.classes_)
    y_test_bin = label_binarize(y_test, classes=list(range(n_classes)))

    plt.figure(figsize=(12, 8))
    for class_idx in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, class_idx], y_proba[:, class_idx])
        auc_score = roc_auc_score(y_test_bin[:, class_idx], y_proba[:, class_idx])
        class_label = le.inverse_transform([class_idx])[0]
        plt.plot(fpr, tpr, label=f"Class {class_label} vs Rest (AUC = {auc_score:.2f})")

    plt.plot([0, 1], [0, 1], 'k--', label="Random Guessing")
    plt.title(f"One-vs-Rest ROC Curves - {model_name}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
def evaluate_model(model, model_name, X_test, y_test, beta=2.0):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    conf = confusion_matrix(y_test, y_pred, labels=list(range(len(le.classes_))))
    
    metrics_list = []

    for class_idx in range(len(le.classes_)):
        bin_true = (y_test == class_idx).astype(int)
        bin_score = y_proba[:, class_idx]

        metrics = calculate_metrics(
            conf_matrix=conf,
            class_idx=class_idx,
            y_true=bin_true,
            y_scores=bin_score,
            beta=beta
        )
        metrics["Model"] = model_name
        metrics["Class"] = le.inverse_transform([class_idx])[0]
        metrics_list.append(metrics)

    return pd.DataFrame(metrics_list)

all_metrics = pd.concat([
    evaluate_model(model, name, X_test, y_test)
    for name, model in models.items()
], ignore_index=True)


cols = ["Model", "Class", "Accuracy", "Precision", "Recall (Sensitivity)", "Specificity",
        "FPR", "F1 Score", "F2 Score", "Balanced Accuracy", "ROC-AUC"]
all_metrics = all_metrics[cols]

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

display(all_metrics.sort_values(by=["Model", "Class"]))

In [None]:
def plot_overall_confusion_matrix(model, model_name, X_test, y_test, class_labels):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=range(len(class_labels)))

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Overall Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

def plot_binary_confusion_matrix(model, model_name, X_test, y_test, class_idx, class_name):
    y_pred = model.predict(X_test)
    bin_true = (y_test == class_idx).astype(int)
    bin_pred = (y_pred == class_idx).astype(int)
    
    cm = confusion_matrix(bin_true, bin_pred)

    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Oranges", cbar=False,
                xticklabels=["Not " + class_name, class_name],
                yticklabels=["Not " + class_name, class_name])
    plt.title(f'Binary Confusion Matrix - {model_name} - Class: {class_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

for model_name, model in models.items():
    print(f"🔷 {model_name}")
    plot_overall_confusion_matrix(model, model_name, X_test, y_test, le.classes_)

    for class_idx, class_name in enumerate(le.classes_):
        plot_binary_confusion_matrix(model, model_name, X_test, y_test, class_idx, str(class_name))

In [None]:
def plot_roc_pr_curves(model_pipeline, X_data_test, y_data_test, model_type_name, fitted_label_encoder):
    y_score_probabilities = model_pipeline.predict_proba(X_data_test)
    n_model_classes = len(fitted_label_encoder.classes_)
    y_data_test_binarized = label_binarize(y_data_test, classes=range(n_model_classes))
    class_name_labels = fitted_label_encoder.classes_
    fpr_dict = dict()
    tpr_dict = dict()
    roc_auc_values_dict = dict()
    plt.figure(figsize=(12, 10))
    for i in range(n_model_classes):
        fpr_dict[i], tpr_dict[i], _ = roc_curve(y_data_test_binarized[:, i], y_score_probabilities[:, i])
        roc_auc_values_dict[i] = auc(fpr_dict[i], tpr_dict[i])
        plt.plot(fpr_dict[i], tpr_dict[i], lw=2, label=f'ROC класс {class_name_labels[i]} (AUC = {roc_auc_values_dict[i]:.2f})')
    fpr_dict["micro"], tpr_dict["micro"], _ = roc_curve(y_data_test_binarized.ravel(), y_score_probabilities.ravel())
    roc_auc_values_dict["micro"] = auc(fpr_dict["micro"], tpr_dict["micro"])
    plt.plot(fpr_dict["micro"], tpr_dict["micro"],
             label=f'Micro-средняя ROC (AUC = {roc_auc_values_dict["micro"]:.2f})',
             color='deeppink', linestyle=':', linewidth=4)
    all_unique_fpr = np.unique(np.concatenate([fpr_dict[i] for i in range(n_model_classes)]))
    mean_calculated_tpr = np.zeros_like(all_unique_fpr)
    for i in range(n_model_classes):
        mean_calculated_tpr += np.interp(all_unique_fpr, fpr_dict[i], tpr_dict[i])
    mean_calculated_tpr /= n_model_classes
    fpr_dict["macro"] = all_unique_fpr
    tpr_dict["macro"] = mean_calculated_tpr
    roc_auc_values_dict["macro"] = auc(fpr_dict["macro"], tpr_dict["macro"])
    plt.plot(fpr_dict["macro"], tpr_dict["macro"],
             label=f'Macro-средняя ROC (AUC = {roc_auc_values_dict["macro"]:.2f})',
             color='navy', linestyle=':', linewidth=4)
    plt.plot([0, 1], [0, 1], '--', color='gray', label='Случайное предсказание')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC-кривая с усреднением - {model_type_name}")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.figure(figsize=(10, 8))
    for i in range(n_model_classes):
        precision_values, recall_values, _ = precision_recall_curve(y_data_test_binarized[:, i], y_score_probabilities[:, i])
        avg_precision = auc(recall_values, precision_values)
        plt.plot(recall_values, precision_values, lw=2, label=f'PR класс {class_name_labels[i]} (AP = {avg_precision:.2f})')
    plt.xlabel("Recall (Полнота)")
    plt.ylabel("Precision (Точность)")
    plt.title(f"Precision-Recall кривая - {model_type_name}")
    plt.legend(loc="best")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

print("\n--- Построение ROC и PR кривых для XGBoost (отдельно) ---")
if 'pipe_xgb' in locals() and hasattr(pipe_xgb, "predict_proba"):
    plot_roc_pr_curves(pipe_xgb, X_test, y_test, "XGBoost", le)
else:
    print("Модель pipe_xgb не обучена или недоступна.")

print("\n--- Построение ROC и PR кривых для LightGBM (отдельно) ---")
if 'pipe_lgbm' in locals() and hasattr(pipe_lgbm, "predict_proba"):
    plot_roc_pr_curves(pipe_lgbm, X_test, y_test, "LightGBM", le)
else:
    print("Модель pipe_lgbm не обучена или недоступна.")

print("\n--- Построение ROC и PR кривых для CatBoost (отдельно) ---")
if 'pipe_cat' in locals() and hasattr(pipe_cat, "predict_proba"):
    plot_roc_pr_curves(pipe_cat, X_test, y_test, "CatBoost", le)
else:
    print("Модель pipe_cat не обучена или недоступна.")


mlflow server

In [None]:
logged_model = 'runs:/70d3629b68cc4109bb3f37a47ce4f1df/model_pipeline'

loaded_model = mlflow.pyfunc.load_model(logged_model)

In [None]:
joblib.dump(loaded_model, 'model.pkl')

In [None]:
joblib.load('model.pkl')