In [None]:
from google.colab import drive
# Mount Google Drive so the data files under /content/drive are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


# Модель

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# List the installed versions of the packages whose names match the key libraries used here.
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

geopandas==0.13.2
lightgbm==4.1.0
numpy==1.25.2
pandas==1.5.3
pandas-datareader==0.10.0
pandas-gbq==0.19.2
pandas-stubs==1.5.3.230304
scikit-learn==1.2.2
sklearn-pandas==2.2.0


## Загрузка данных

In [None]:
# Read the train and test tables from the mounted Google Drive.
train_df, test_df = (
    pd.read_parquet(path)
    for path in (
        "/content/drive/MyDrive/train_data.pqt",
        "/content/drive/MyDrive/test_data.pqt",
    )
)

In [None]:
# Preview the first three rows of the training table.
train_df.head(n=3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}


In [None]:
# Preview the first three rows of the test table.
test_df.head(n=3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,


Обозначение категориальных признаков

In [None]:
# Columns treated as categorical features throughout the notebook.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

In [None]:
# Cast every categorical column to the pandas "category" dtype in both tables.
train_df = train_df.astype({col: "category" for col in cat_cols})
test_df = test_df.astype({col: "category" for col in cat_cols})

## Подготовка данных

Работа с пропусками

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, Normalizer, MinMaxScaler, RobustScaler

In [None]:
# Drop rows that are mostly empty: a row is kept only if it has at least
# len(columns) / 2.1 (roughly 48% of the columns) non-missing values.
# `DataFrame.dropna(thresh=...)` is documented to take an int, so the fractional
# bound is rounded up; for integer counts `count >= ceil(x)` selects exactly the
# same rows as `count >= x`.
threshold = int(np.ceil(len(train_df.columns) / 2.1))
train_df = train_df.dropna(thresh=threshold)

In [None]:
# Replace missing values in every numeric column with that column's mean.
numeric_cols = train_df.select_dtypes(include=np.number).columns
train_df[numeric_cols] = train_df[numeric_cols].fillna(
    value=train_df[numeric_cols].mean()
)

In [None]:
# Categorical columns that still contain gaps are filled with their most frequent value.
cat_cols_missing = [name for name in cat_cols if train_df[name].isna().any()]
if len(cat_cols_missing) > 0:
    imp_mode = SimpleImputer(strategy="most_frequent")
    train_df[cat_cols_missing] = imp_mode.fit_transform(train_df[cat_cols_missing])

Кодирование категориальных признаков (Label Encoding)

In [None]:
# Label-encode every categorical column of the training frame.
# Each column gets its OWN LabelEncoder: sharing one instance would leave every
# entry of `le` pointing at an encoder fitted on the last column only, so the
# stored per-column mappings could not be reused or inverted.
le = {}
for col in cat_cols:
    le[col] = LabelEncoder()
    train_df[col] = le[col].fit_transform(train_df[col])

Создаем выборки для валидации и обучения

In [None]:
# Features: everything except the identifier, the month label and the target.
X = train_df.drop(columns=["id", "date", "end_cluster"])
# Target: the cluster at the end of the period.
y = train_df.loc[:, "end_cluster"]

In [None]:
# Hold out 30% of the rows for validation; the seed is fixed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Display the training target to inspect the cluster labels.
y_train

248354        {α}
435382        {α}
468717        {α}
344462        {α}
229137         {}
           ...   
123193        {α}
289475     {α, η}
408483        {α}
147297    {other}
136176         {}
Name: end_cluster, Length: 376203, dtype: object

## Обучение модели

LGBM

In [None]:
# Two-step pipeline: min-max scaling of the features followed by a LightGBM classifier.
pipeline = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        (
            "model",
            LGBMClassifier(
                n_estimators=40,
                learning_rate=0.0075,
                random_state=42,
                n_jobs=-1,
                verbosity=-1,
            ),
        ),
    ]
)

## Подбор параметров (для разработки)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the "model" step of the pipeline.
# Other learning-rate grids tried earlier: [0.01, 0.1, 0.5] and [0.2].
param_grid = dict(
    model__n_estimators=[50, 70, 90],
    model__learning_rate=[0.15, 0.2],
)

# Fit the pipeline with 3-fold cross-validated grid search, selecting by accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Evaluate the search result on the held-out validation set.
accuracy = grid_search.score(x_val, y_val)
print("Accuracy:", accuracy)

# Report the best parameter combination found.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Accuracy: 0.6944426093045994
Best parameters: {'model__learning_rate': 0.15, 'model__n_estimators': 50}


In [None]:
import matplotlib.pyplot as plt

from sklearn.model_selection import learning_curve

# Build the learning curve for the pipeline defined earlier in the notebook.
# The original call passed an undefined name `model`, which raises NameError
# on a fresh kernel (Restart & Run All).
train_sizes, train_scores, test_scores = learning_curve(
    pipeline, X, y, cv=3, scoring='accuracy', train_sizes=[0.25, 0.5, 1.0])

# Mean and standard deviation of the scores across the CV folds.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot the curves. The plotted quantity is accuracy (see `scoring` above), so the
# legend says "Точность" (accuracy) rather than "Ошибка" (error).
plt.plot(train_sizes, train_mean, 'o-', color='b', label='Точность на обучающей выборке')
plt.plot(train_sizes, test_mean, 'o-', color='r', label='Точность на валидационной выборке')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='red')
plt.xlabel('Размер обучающей выборки')
plt.ylabel('Оценка точности')
plt.legend(loc='best')
plt.show()

## Обучение пайплайна на обучающей выборке (ячейку нужно запустить обязательно)

In [None]:
# Fit the `pipeline` object itself on the training split.
# NOTE(review): this uses the hyper-parameters given when `pipeline` was built,
# not the best parameters reported by the grid search -- confirm this is intended.
pipeline.fit(X=x_train, y=y_train)

Зададим функцию для взвешенной метрики ROC AUC

In [None]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Return the class-weighted sum of one-vs-rest ROC AUC scores.

    Args:
        y_true: True class labels, forwarded to ``roc_auc_score``.
        y_pred: Predicted class scores, forwarded to ``roc_auc_score``
            (presumably one column per entry of ``labels`` -- TODO confirm).
        labels: Class labels; used both to look up the weights and as the
            ``labels`` argument of ``roc_auc_score``.
        weights_dict: Mapping from class label to its unnormalised weight.

    Returns:
        The sum over classes of (per-class ROC AUC) * (weight normalised so
        that all weights sum to 1).

    Raises:
        KeyError: If ``weights_dict`` is a dict and lacks one of ``labels``.
    """
    raw_weights = np.array([weights_dict[name] for name in labels])
    normalised = raw_weights / raw_weights.sum()
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,  # keep one score per class so they can be weighted
    )
    total = 0
    for weight, auc in zip(normalised, per_class_auc):
        total += weight * auc
    return total

In [None]:
# Load the per-cluster weights and turn them into a {cluster: weight} dict.
cluster_weights = pd.read_excel("/content/drive/MyDrive/cluster_weights.xlsx")
cluster_weights = cluster_weights.set_index("cluster")
weights_dict = cluster_weights.loc[:, "unnorm_weight"].to_dict()

## Проверка работы модели

In [None]:
# Class probabilities for the validation split; show the resulting array shape.
y_pred_proba = pipeline.predict_proba(x_val)
np.shape(y_pred_proba)

(161230, 17)

In [None]:
# Weighted ROC AUC of the fitted pipeline on the validation split.
weighted_roc_auc(
    y_true=y_val,
    y_pred=y_pred_proba,
    labels=pipeline.classes_,
    weights_dict=weights_dict,
)

0.8787511937707377

MinMaxScaler 0.8767

RobustScaler 0.8765

StandardScaler 0.83

## Прогноз на тестовой выборке

In [None]:
# Show start_cluster per client (rows) and month (columns) for the first three ids.
(
    test_df
    .pivot(index="id", columns="date", values="start_cluster")
    .head(n=3)
)

date,month_4,month_5,month_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000,15,15,15
200001,15,15,15
200002,15,15,15


Для того чтобы сделать прогноз на тестовой выборке, нужно заполнить стартовый кластер.<br>
В качестве базового подхода заполним все стартовые кластеры самым популярным кластером.

In [None]:
# Baseline: give every test row the most frequent start_cluster value of the training frame.
# NOTE(review): if the training column has already been label-encoded at this point,
# the value assigned here is an integer code rather than the original label.
test_df["start_cluster"] = train_df["start_cluster"].mode().iloc[0]
test_df["start_cluster"] = test_df["start_cluster"].astype("category")

In [None]:
# Load the submission template; the predictions are written into it later.
sample_submission_df = pd.read_csv(
    "/content/drive/MyDrive/sample_submission.csv"
)

In [None]:
# Display the (rows, columns) shape of the submission template.
sample_submission_df.shape

(100000, 18)

In [None]:
# Preview the first five rows of the submission template.
sample_submission_df.head(n=5)

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,200001,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,200002,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,200003,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,200004,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


Для тестовой выборки будем использовать только последний месяц

In [None]:
# Keep only the rows of the last month ("month_6") and drop the non-feature columns.
last_m_test_df = (
    test_df
    .loc[test_df["date"] == "month_6"]
    .drop(columns=["id", "date"])
)

Кодирование категориальных признаков (Label Encoding)

In [None]:
# Label-encode the categorical features of the last-month test slice,
# using a separate LabelEncoder per column (a shared instance would leave every
# entry of `le` fitted on the last column only).
#
# NOTE(review): the encoders are fitted on the test data here, so the integer
# codes are not guaranteed to match the codes the model saw during training;
# ideally the encoders fitted on the training frame should be reused via
# `.transform`.
le = {}
for col in cat_cols:
    if col == "start_cluster":
        # This column was filled with the most frequent start_cluster value taken
        # from the already label-encoded training frame, i.e. it already holds a
        # training code. Re-fitting an encoder on this constant column would
        # collapse it to 0 and present the model with a different cluster.
        last_m_test_df[col] = last_m_test_df[col].astype(int)
        continue
    le[col] = LabelEncoder()
    last_m_test_df[col] = le[col].fit_transform(last_m_test_df[col])

In [None]:
# Preview the encoded test features.
last_m_test_df.head(n=5)

Unnamed: 0,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,ogrn_days_end_month,ogrn_days_end_quarter,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
2,0.045988,0.049418,-0.125995,0.047079,4,295,0,225,-1.533705,-1.683741,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,0
5,-0.156722,-0.20492,-0.125856,-0.156258,39,3087,0,225,0.092087,1.22003,...,-0.028584,,,-0.165588,,,-0.201123,,,0
8,-0.048015,0.448252,-0.125995,-0.047215,4,295,0,201,-1.069193,-1.528873,...,0.123154,0.94685,0.453739,2.61487,0.565087,0.818798,4.449125,0.258723,0.627287,0
11,-0.156579,-0.204813,-0.125501,-0.156115,37,1340,0,225,-0.256297,-1.257854,...,-0.028584,,,-0.165588,,,-0.201123,,,0
13,-0.153379,-0.201932,-0.125995,-0.154155,37,3523,2584,225,0.672727,1.413615,...,-0.027573,0.944889,0.396267,-0.165324,0.547032,0.418798,-0.201123,0.250924,0.37454,0


In [None]:
# Predict class probabilities for the test slice and label the columns with the class names.
test_pred_proba = pipeline.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(data=test_pred_proba, columns=pipeline.classes_)

# Reorder the probability columns alphabetically by class name.
sorted_classes = test_pred_proba_df.columns.to_list()
sorted_classes.sort()
test_pred_proba_df = test_pred_proba_df.loc[:, sorted_classes]

In [None]:
# Display the (rows, columns) shape of the predicted-probability table.
test_pred_proba_df.shape

(100000, 17)

In [None]:
# Preview the first two rows of predicted probabilities.
test_pred_proba_df.head(n=2)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.218887,0.227906,0.017898,0.086468,0.007229,0.002652,0.001681,0.000534,0.008118,0.053469,0.012046,0.001598,0.003271,2.8e-05,0.004751,0.35329,0.000173
1,0.158954,0.249483,0.017238,0.055822,0.006005,0.00238,0.001677,0.00065,0.008137,0.068728,0.013838,0.001288,0.002973,2.7e-05,0.013474,0.399161,0.000165


In [None]:
# Copy the predicted probabilities into the template's class columns and save it.
# NOTE(review): rows are matched by DataFrame index, not by client id -- this
# assumes the template rows are in the same order as the test slice; confirm.
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv(path_or_buf="npb.csv", index=False)

In [None]:
# Read the saved submission back and display it as a sanity check.
r = pd.read_csv(filepath_or_buffer="npb.csv")
r

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.218887,0.227906,0.017898,0.086468,0.007229,0.002652,0.001681,0.000534,0.008118,0.053469,0.012046,0.001598,0.003271,0.000028,0.004751,0.353290,0.000173
1,200001,0.158954,0.249483,0.017238,0.055822,0.006005,0.002380,0.001677,0.000650,0.008137,0.068728,0.013838,0.001288,0.002973,0.000027,0.013474,0.399161,0.000165
2,200002,0.298335,0.134823,0.017610,0.072215,0.007221,0.002893,0.003198,0.004342,0.017394,0.054190,0.011976,0.006573,0.003315,0.000028,0.007640,0.358055,0.000192
3,200003,0.157268,0.248451,0.017055,0.055229,0.005942,0.002355,0.001659,0.000643,0.008051,0.067999,0.013691,0.001274,0.002941,0.000027,0.022326,0.394926,0.000163
4,200004,0.172150,0.170667,0.018798,0.087130,0.006549,0.002595,0.001829,0.000709,0.008874,0.074948,0.016406,0.001405,0.003693,0.000036,0.006630,0.427402,0.000179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.156835,0.239622,0.017410,0.055077,0.005925,0.002348,0.001655,0.000636,0.008029,0.067812,0.034062,0.001446,0.002933,0.000135,0.011997,0.393838,0.000241
99996,299996,0.223006,0.155217,0.018403,0.077023,0.007075,0.002482,0.001749,0.000535,0.008487,0.055897,0.022450,0.001528,0.003532,0.000030,0.004966,0.417437,0.000184
99997,299997,0.269630,0.142153,0.020773,0.095523,0.007196,0.002511,0.001845,0.000642,0.009230,0.056547,0.012407,0.001487,0.003459,0.000030,0.005024,0.371358,0.000184
99998,299998,0.176413,0.171785,0.019264,0.062381,0.006711,0.002660,0.001874,0.000726,0.009094,0.076804,0.015464,0.001591,0.003785,0.000032,0.013250,0.437984,0.000184
