In [7]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [8]:
# Параметры для CatBoostClassifier
params = {
    'iterations': 500,             
    'learning_rate': 0.1,         
    'depth': 6,                  
    'verbose': 100,              
    'random_seed': 42,
    'thread_count': -1,          
    'early_stopping_rounds': 50    
}

In [9]:
# Загрузка данных из файлов
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")

In [10]:
# Список категориальных признаков
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]
# Преобразование категориальных признаков в строки
for col in cat_cols:
    train_df[col] = train_df[col].astype(str)
    test_df[col] = test_df[col].astype(str)

# Разделение признаков и целевой переменной в обучающем наборе данных
X = train_df.drop(["id", "date", "end_cluster"], axis=1)
y = train_df["end_cluster"]

# Разделение данных на обучающую и валидационную выборки
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Создание и настройка модели CatBoostClassifier
model = CatBoostClassifier(**params)

train_pool = Pool(x_train, y_train, cat_features=cat_cols)
val_pool = Pool(x_val, y_val, cat_features=cat_cols)

model.fit(train_pool, eval_set=val_pool, plot=True)  

In [None]:
# Функция для вычисления ROC AUC
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [None]:
# Загрузка весов кластеров
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [None]:
# Получение вероятностей предсказаний для валидационной выборки
y_pred_proba = model.predict_proba(x_val)
y_pred_proba.shape

# Вычисление взвешенного ROC AUC для валидационной выборки
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

0.9659211303195099

In [None]:
# Заполнение столбца start_cluster в тестовом наборе данных модальным значением из обучающего набора данных
test_df["start_cluster"] = train_df["start_cluster"].mode()[0]
test_df["start_cluster"] = test_df["start_cluster"].astype("category")

In [None]:
# Загрузка файла шаблона для отправки результатов
sample_submission_df = pd.read_csv("sample_submission.csv")

In [None]:
# Фильтрация данных тестового набора для последнего месяца
last_m_test_df = test_df[test_df["date"] == "month_6"]
last_m_test_df = last_m_test_df.drop(["id", "date"], axis=1)

In [None]:
# Получение вероятностей предсказаний для данных последнего месяца
test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model.classes_)

# Сортировка столбцов предсказаний в алфавитном порядке
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [None]:
# Вывод первых двух строк предсказаний для проверки
test_pred_proba_df.head(2)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.011003,0.028593,0.018889,0.020912,0.005991,0.00034,0.002162,0.000738,0.01824,0.006183,0.018691,0.000465,0.002234,1.1e-05,0.003691,0.861849,7e-06
1,0.012663,0.51962,0.000964,0.003128,0.000671,0.00028,0.000862,6.4e-05,0.003343,0.021955,0.003699,0.00034,0.001407,2e-06,0.001209,0.4296,0.000192


In [None]:
# Заполнение файла шаблона предсказаниями и сохранение результата в CSV файл
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("test/test.csv", index=False)