In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [3]:
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")

In [None]:
# y_train.value_counts().plot(kind='bar')

#### target class - "end_cluster" (df[-1])

In [4]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

cat_cols_num = []

In [5]:
for x,y in zip(test_df.columns, test_df.isna().sum()):
    print(x, "---", y)

id --- 0
date --- 0
balance_amt_avg --- 29483
balance_amt_max --- 29483
balance_amt_min --- 29483
balance_amt_day_avg --- 29483
channel_code --- 20028
city --- 60612
city_type --- 60992
index_city_code --- 156461
ogrn_days_end_month --- 14278
ogrn_days_end_quarter --- 14278
ogrn_month --- 14278
ogrn_year --- 14278
ft_registration_date --- 14838
max_founderpres --- 161192
min_founderpres --- 161192
ogrn_exist_months --- 14278
okved --- 14479
segment --- 14020
sum_of_paym_2m --- 73461
sum_of_paym_6m --- 73461
sum_of_paym_1y --- 73461
sum_a_oper_1m --- 0
cnt_a_oper_1m --- 95210
sum_b_oper_1m --- 0
cnt_b_oper_1m --- 95210
sum_c_oper_1m --- 0
cnt_c_oper_1m --- 95210
sum_deb_d_oper_1m --- 0
cnt_deb_d_oper_1m --- 95210
sum_cred_d_oper_1m --- 0
cnt_cred_d_oper_1m --- 95210
sum_deb_e_oper_1m --- 0
cnt_deb_e_oper_1m --- 95210
cnt_days_deb_e_oper_1m --- 95210
sum_cred_e_oper_1m --- 0
cnt_cred_e_oper_1m --- 95210
cnt_days_cred_e_oper_1m --- 95210
sum_deb_f_oper_1m --- 0
cnt_deb_f_oper_1m --- 95210

Создаем выборки для валидации и обучения

In [27]:
to_drop_cols = [
    "date",
    'cnt_a_oper_1m', "cnt_b_oper_1m", 'cnt_c_oper_1m', 'cnt_deb_d_oper_1m','cnt_cred_d_oper_1m','cnt_deb_e_oper_1m','cnt_days_deb_e_oper_1m','cnt_cred_e_oper_1m','cnt_days_cred_e_oper_1m',
    'cnt_deb_f_oper_1m','cnt_days_deb_f_oper_1m','cnt_cred_f_oper_1m','cnt_days_cred_f_oper_1m','cnt_deb_g_oper_1m',
    'cnt_days_deb_g_oper_1m','cnt_cred_g_oper_1m','cnt_days_cred_g_oper_1m','cnt_deb_h_oper_1m','cnt_days_deb_h_oper_1m','cnt_cred_h_oper_1m','cnt_days_cred_h_oper_1m',
    'cnt_days_cred_h_oper_3m','cnt_cred_h_oper_3m','cnt_days_deb_h_oper_3m','cnt_deb_h_oper_3m','cnt_days_cred_g_oper_3m','cnt_cred_g_oper_3m','cnt_days_deb_g_oper_3m','cnt_deb_g_oper_3m','cnt_days_cred_f_oper_3m','cnt_cred_f_oper_3m','cnt_days_deb_f_oper_3m','cnt_deb_f_oper_3m','cnt_days_cred_e_oper_3m','cnt_cred_e_oper_3m','cnt_days_deb_e_oper_3m','cnt_deb_e_oper_3m','cnt_cred_d_oper_3m','cnt_deb_d_oper_3m','cnt_c_oper_3m','cnt_b_oper_3m','cnt_a_oper_3m',

    'sum_of_paym_2m', 'sum_of_paym_6m', 'sum_of_paym_1y',

    'balance_amt_max','balance_amt_min','balance_amt_day_avg','index_city_code',

    'ogrn_days_end_month', 'ogrn_days_end_quarter','ogrn_month','ogrn_year','max_founderpres','min_founderpres','ogrn_exist_months'
]
to_drop_cols.append("id")

In [28]:
raw_X = train_df.drop(to_drop_cols, axis=1)

raw_X['balance_amt_avg'] = raw_X['balance_amt_avg'].fillna(raw_X['balance_amt_avg'].mean())
raw_X['channel_code'] = raw_X['channel_code'].fillna('channel_code_8')
raw_X['city']= raw_X['city'].fillna('city_0')
raw_X['city_type']= raw_X['city_type'].fillna('city_type_0')
raw_X['ft_registration_date'] = raw_X['ft_registration_date'].fillna(0.407456)
raw_X['okved']= raw_X['okved'].fillna('okved_5')
raw_X['segment']= raw_X['segment'].fillna('segment_3')

y = raw_X["end_cluster"]
raw_X = raw_X.drop(['end_cluster'],axis=1)

x_train, x_val, y_train, y_val = train_test_split(raw_X, y,
                                                  test_size=0.2,
                                                  random_state=42)


In [29]:
for i, col in enumerate(cat_cols):
    for j, col1 in enumerate(x_train.columns):
        if col == col1:
            cat_cols_num.append(j)
cat_cols_num = list(set(sorted(cat_cols_num)))
cat_cols_num

[1, 2, 3, 33, 5, 6]

In [15]:
model = CatBoostClassifier(iterations=20,
                           depth=8,
                           learning_rate=0.1,
                        #    loss_function='AUC',
                           custom_loss=['AUC', 'MultiClass'],
                           verbose=True,
                           # random_state=1337,
                           random_seed=228)
# train the model
model.fit(x_train, y_train,
        cat_features=cat_cols_num,
        eval_set=(x_val, y_val),
        verbose=True,
)
# make the prediction using the resulting model
# preds_class = model.predict(x_val)
preds_proba = model.predict_proba(x_val)
# print("class = ", preds_class)
print("proba = ", preds_proba)

0:	learn: 2.0658283	test: 2.0643786	best: 2.0643786 (0)	total: 15.4s	remaining: 4m 52s
1:	learn: 1.8136090	test: 1.8119959	best: 1.8119959 (1)	total: 28.3s	remaining: 4m 14s
2:	learn: 1.6472885	test: 1.6452767	best: 1.6452767 (2)	total: 38.1s	remaining: 3m 36s
3:	learn: 1.5240514	test: 1.5220126	best: 1.5220126 (3)	total: 46.5s	remaining: 3m 5s
4:	learn: 1.4296771	test: 1.4276396	best: 1.4276396 (4)	total: 54.4s	remaining: 2m 43s
5:	learn: 1.3523349	test: 1.3508182	best: 1.3508182 (5)	total: 1m 2s	remaining: 2m 26s
6:	learn: 1.2886374	test: 1.2876006	best: 1.2876006 (6)	total: 1m 10s	remaining: 2m 10s
7:	learn: 1.2362364	test: 1.2355897	best: 1.2355897 (7)	total: 1m 18s	remaining: 1m 57s
8:	learn: 1.1929771	test: 1.1926653	best: 1.1926653 (8)	total: 1m 25s	remaining: 1m 44s
9:	learn: 1.1553418	test: 1.1554127	best: 1.1554127 (9)	total: 1m 37s	remaining: 1m 37s
10:	learn: 1.1225904	test: 1.1229660	best: 1.1229660 (10)	total: 1m 48s	remaining: 1m 28s
11:	learn: 1.0942862	test: 1.0949346	

#### Eval

Зададим функцию для взвешенной метрики roc auc

In [30]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [31]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [32]:
y_pred_proba = model.predict_proba(x_val)
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

0.8755371643607294

#### Sub

In [33]:
sample_submission_df = pd.read_csv("sample_submission.csv")

test_df["start_cluster"] = train_df["start_cluster"].mode()[0]

test_df['balance_amt_avg'] = test_df['balance_amt_avg'].fillna(test_df['balance_amt_avg'].mean())
test_df['channel_code'] = test_df['channel_code'].fillna('channel_code_8')
test_df['city']= test_df['city'].fillna('city_0')
test_df['city_type']= test_df['city_type'].fillna('city_type_0')
test_df['ft_registration_date'] = test_df['ft_registration_date'].fillna(0.5)
test_df['okved']= test_df['okved'].fillna('okved_5')
test_df['segment']= test_df['segment'].fillna('segment_3')

last_m_test_df = test_df[test_df["date"] == "month_6"]
last_m_test_df = last_m_test_df.drop(to_drop_cols, axis=1)

In [34]:
test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model.classes_)
sorted_classes = test_pred_proba_df.columns.to_list()
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [37]:
test_pred_proba_df['id'] = test_df['id']

sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("sub.csv", index=False)

ValueError: Columns must be same length as key