In [151]:
#!pip install catboost -q
#!pip install optuna -q

import pandas as pd
import numpy as np
import sklearn
import ast
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, train_test_split
import time
from sklearn.metrics import classification_report, roc_auc_score
import warnings

sklearn.set_config(transform_output="pandas")
warnings.filterwarnings('ignore')

In [104]:
df = pd.read_csv('nlp_tickets_train.csv', sep=';')
dt = pd.read_csv('nlp_tickets_test.csv', sep=';')
X_train, y_train = df["vector"], df["cluster"]
X_test, y_test = dt["vector"], dt["cluster"]

In [106]:
def vector_transform(data) -> np.ndarray:
    data = data.apply(ast.literal_eval)
    return np.vstack(data)

X_train = vector_transform(X_train)
X_test = vector_transform(X_test)

### Xgboost

In [110]:
import xgboost as xgb

yd_train = y_train - 1
yd_test = y_test - 1

params = {
    'objective': 'multi:softprob',
    'num_class': len(np.unique(yd_train)),
    'max_depth': 6,
    'learning_rate': 0.1,
    'eval_metric': "mlogloss" ,
    'random_seed': 42,
    'num_boost_round': 1000,
    'verbose_eval': 100
    }

dtrain = xgb.DMatrix(X_train, label=yd_train)
dtest = xgb.DMatrix(X_test, label=yd_test)

model = xgb.train(params, dtrain)
y_pred_proba = model.predict(dtest)

y_pred = np.argmax(y_pred_proba, axis=1)


roc_auc = roc_auc_score(yd_test, y_pred_proba, multi_class="ovr")

print(f'XGBoost roc_auc {roc_auc}')

XGBoost roc_auc 0.971614870462636


In [92]:
print("\nОтчет о классификации:\n", classification_report(yd_test, y_pred))


Отчет о классификации:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      3066
           1       0.72      0.49      0.58       133
           2       0.76      0.63      0.69       310
           3       0.73      0.84      0.78      1473
           4       0.73      0.70      0.71       362
           5       0.93      0.90      0.91       524
           6       0.87      0.73      0.79       255
           7       0.73      0.69      0.71       251
           8       0.90      0.87      0.88       897
           9       0.46      0.42      0.44       283

    accuracy                           0.86      7554
   macro avg       0.78      0.72      0.75      7554
weighted avg       0.86      0.86      0.86      7554



### Optuna + xgboost

In [111]:
import optuna

def objective(trial):

    params = {
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "num_class": len(np.unique(y_train)),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000),
        "gamma": trial.suggest_loguniform("gamma", 1e-5, 1),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 1),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 1),
        "random_state": 42
    }

    model = xgb.XGBClassifier(**params)
    model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], verbose=False)

    y_pred_proba = model.predict_proba(Xval)
    return roc_auc_score(yval, y_pred_proba, multi_class="ovr")

Xtrain, Xval, ytrain, yval = train_test_split(X_train, yd_train, test_size=0.2, random_state=42)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
best_params = study.best_params
print('best_params', best_params)

[I 2025-03-14 10:15:52,869] A new study created in memory with name: no-name-3be257ce-d20b-4d55-b6fc-73898e41eb61
[I 2025-03-14 10:16:19,945] Trial 0 finished with value: 0.9732796340119734 and parameters: {'learning_rate': 0.014238394391252926, 'max_depth': 4, 'subsample': 0.7098628614175935, 'colsample_bytree': 0.9669561315802901, 'n_estimators': 622, 'gamma': 7.804062040569442e-05, 'reg_alpha': 0.000570630573744654, 'reg_lambda': 0.06913576231446755}. Best is trial 0 with value: 0.9732796340119734.
[I 2025-03-14 10:16:59,706] Trial 1 finished with value: 0.9730456393393616 and parameters: {'learning_rate': 0.012417218917014142, 'max_depth': 5, 'subsample': 0.7345540302395357, 'colsample_bytree': 0.9553019174138307, 'n_estimators': 741, 'gamma': 4.096595618745277e-05, 'reg_alpha': 0.0005528450154653273, 'reg_lambda': 0.000172775286530944}. Best is trial 0 with value: 0.9732796340119734.
[I 2025-03-14 10:17:47,154] Trial 2 finished with value: 0.9721370895880991 and parameters: {'lear

best_params {'learning_rate': 0.013839438293301354, 'max_depth': 7, 'subsample': 0.7000079378576259, 'colsample_bytree': 0.8690556604536873, 'n_estimators': 1292, 'gamma': 0.08192981009850758, 'reg_alpha': 0.0005572177818439023, 'reg_lambda': 0.44182409910719056}


In [122]:
best_params.update({
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "num_class": len(np.unique(y_train))
})

In [123]:
model = xgb.train(
    params=best_params,
    dtrain=dtrain
)

y_pred_proba = model.predict(dtest)

y_pred = np.argmax(y_pred_proba, axis=1)


roc_auc = roc_auc_score(yd_test, y_pred_proba, multi_class="ovr")

print(f'XGBoost roc_auc {roc_auc}')

XGBoost roc_auc 0.9682486853623807


In [124]:
print("\nОтчет о классификации:\n", classification_report(yd_test, y_pred))


Отчет о классификации:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      3066
           1       0.71      0.44      0.55       133
           2       0.81      0.62      0.70       310
           3       0.74      0.87      0.80      1473
           4       0.75      0.72      0.73       362
           5       0.91      0.90      0.91       524
           6       0.81      0.70      0.75       255
           7       0.83      0.73      0.78       251
           8       0.89      0.88      0.89       897
           9       0.52      0.42      0.46       283

    accuracy                           0.86      7554
   macro avg       0.79      0.73      0.75      7554
weighted avg       0.86      0.86      0.86      7554



### Lightgbm

In [147]:
import lightgbm as lgb

train_lgb = lgb.Dataset(X_train, label=yd_train)
test_lgb = lgb.Dataset(X_test, label=yd_test, reference=train_lgb)

params = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": len(np.unique(y_train)),
    "learning_rate": 0.05,
    "max_depth": 6,
    "num_leaves": 64,
    "subsample": 0.7,
    "colsample_bytree": 0.8,
    "n_estimators": 1000,
    "verbose": -1
}

model = lgb.train(params, train_lgb, num_boost_round=1000, valid_sets=[test_lgb])
y_pred_proba = model.predict(X_test)

y_pred = np.argmax(y_pred_proba, axis=1)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")

print(f'LightGBM roc_auc {roc_auc}')


LightGBM roc_auc 0.9818035359569475


In [148]:
print("\nОтчет о классификации:\n", classification_report(yd_test, y_pred))


Отчет о классификации:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      3066
           1       0.67      0.59      0.63       133
           2       0.78      0.68      0.73       310
           3       0.77      0.85      0.81      1473
           4       0.77      0.73      0.75       362
           5       0.93      0.92      0.92       524
           6       0.81      0.80      0.81       255
           7       0.85      0.75      0.79       251
           8       0.90      0.89      0.90       897
           9       0.55      0.46      0.50       283

    accuracy                           0.87      7554
   macro avg       0.80      0.76      0.78      7554
weighted avg       0.87      0.87      0.87      7554



In [143]:
Xtrain, Xval, ytrain, yval = train_test_split(X_train, yd_train, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": len(np.unique(ytrain)),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 50),
        "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-5, 1),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 1),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 1),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "verbose": -1,
        "random_state": 42
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], eval_metric="multi_logloss")

    y_pred_proba = model.predict_proba(Xval)
    return roc_auc_score(yval, y_pred_proba, multi_class="ovr")

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Лучшие параметры:", best_params)

[I 2025-03-14 12:29:06,411] A new study created in memory with name: no-name-8354df08-0ca2-463d-90f7-2a1974d91ac0
[I 2025-03-14 12:29:35,705] Trial 0 finished with value: 0.9704236297607292 and parameters: {'learning_rate': 0.18280316778462588, 'max_depth': 4, 'num_leaves': 102, 'subsample': 0.738101338366429, 'colsample_bytree': 0.8550853112705568, 'n_estimators': 1194, 'min_data_in_leaf': 34, 'min_gain_to_split': 0.00010698949447368086, 'reg_alpha': 0.00016855930762693706, 'reg_lambda': 0.7770180697412187, 'feature_fraction': 0.9027797017288843, 'bagging_fraction': 0.5905272071453982, 'bagging_freq': 5}. Best is trial 0 with value: 0.9704236297607292.
[I 2025-03-14 12:29:54,834] Trial 1 finished with value: 0.9741385376434298 and parameters: {'learning_rate': 0.034854403051285655, 'max_depth': 8, 'num_leaves': 119, 'subsample': 0.7292480170077216, 'colsample_bytree': 0.7741305628236206, 'n_estimators': 967, 'min_data_in_leaf': 5, 'min_gain_to_split': 1.23054149946957e-05, 'reg_alpha'

Лучшие параметры: {'learning_rate': 0.0710199032375748, 'max_depth': 9, 'num_leaves': 85, 'subsample': 0.6709572617675066, 'colsample_bytree': 0.7209379386470598, 'n_estimators': 1315, 'min_data_in_leaf': 16, 'min_gain_to_split': 0.002276948341835634, 'reg_alpha': 1.004413011993661e-05, 'reg_lambda': 2.9196525907592787e-05, 'feature_fraction': 0.65163101387557, 'bagging_fraction': 0.999141054404916, 'bagging_freq': 5}


In [149]:
train_lgb = lgb.Dataset(X_train, label=yd_train)
test_lgb = lgb.Dataset(X_test, label=yd_test, reference=train_lgb)

best_params.update({"objective": "multiclass", "metric": "multi_logloss", "num_class": len(np.unique(y_train)), "verbose": -1})

model = lgb.train(best_params, train_lgb, num_boost_round=1000, valid_sets=[test_lgb])

y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")

print(f"LightGBM optuna optmized ROC AUC: {roc_auc:.4f}")

LightGBM optuna optmized ROC AUC: 0.9822


In [150]:
print("\nОтчет о классификации:\n", classification_report(yd_test, y_pred))


Отчет о классификации:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      3066
           1       0.69      0.53      0.60       133
           2       0.82      0.66      0.73       310
           3       0.75      0.87      0.81      1473
           4       0.78      0.72      0.75       362
           5       0.93      0.91      0.92       524
           6       0.83      0.78      0.81       255
           7       0.86      0.75      0.80       251
           8       0.90      0.88      0.89       897
           9       0.55      0.45      0.49       283

    accuracy                           0.87      7554
   macro avg       0.81      0.75      0.78      7554
weighted avg       0.87      0.87      0.87      7554



### Выводы

В дополнение к catboost, SVC и логистической регрессии опробованы бустинги:


XGBoost \
Оптимизированный XGBoost  с Optuna\
Оценка моделей выполнена с помощью метрик ROC AUC, precision, recall, f1-score и accuracy.

- Logistic Regression: 0.908
- SVC: 0.935
- CatBoost: 0.984
- Optuna + CatBoost: 0.982
- XGBoost:  0.971
- Optuna + XGBoost: 0.968
- Lightgbm: 0.981
- Optuna + Lightgbm: 0.982\
CatBoost и его оптимизированная версия показали лучшие результаты по метрике ROC AUC, существенно превосходя линейные модели и немного превосходя другие бустинги.

Сравнение по метрике accuracy (доля правильных ответов)
- Logistic Regression: 0.70
- SVC: 0.75
- CatBoost: 0.88
- Optuna + CatBoost: 0.88
- XGBoost: 0.86
- Optuna + XGBoost: 0.86
- Lightgbm: 0.87
- Optuna + Lightgbm: 0.87\
CatBoost и его оптимизированная версия также показывают наивысшую точность.


Оптимизация несущественно улучшила метрики для LGBM, для XGBoost не дала значимого прироста.

Catboost остался лучшим по качеству бустингом, чуть хуже сработал LightGBM, после него - XGBoost.