In [2]:
import pandas as pd
import numpy as np

import json
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

import datetime as dt
import yaml
import os

In [3]:
# Подключение к диску, чтобы забрать исходный датасет
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install catboost
from catboost import CatBoostClassifier, Pool

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [5]:
dataset_path="/content/drive/MyDrive/Hacks/LeadersOfDigital NN 2024/data"

users = pd.read_csv(f'{dataset_path}/cntrbtrs_clnts_ops_trn.csv', delimiter=';', encoding='cp1251', dtype={'okato': str, 'pstl_code': str}, parse_dates=['accnt_bgn_date'])
# trunc = pd.read_csv(f'{dataset_path}/trnsctns_ops_trn.csv', delimiter=';', encoding='cp1251', parse_dates=['oprtn_date'])

In [6]:
# Классы обработчиков данных, которые являются полной копией из файла data_processors.py

class DataProcessorV1:
    """
    Исходный датасет, только клиенты
    """
    def __init__(self, path_to_configs, config_name):
        current_weights_path = os.path.join(path_to_configs, config_name)
        with open(current_weights_path, 'r') as file:
            self.config = yaml.safe_load(file)

    def process(self, transactions: pd.DataFrame, clients: pd.DataFrame):
        clients_columns_typing = self.config['data']['columns']['clients']
        clients_columns = list(self.config['data']['columns']['clients'].keys())

        clients = clients[clients_columns]

        clients['gndr'] = clients['gndr'].map({'ж': 0, 'м': 1})
        clients['accnt_bgn_date'] = clients['accnt_bgn_date'].astype(np.int64) // 1e9
        clients['accnt_status'] = clients['accnt_status'].map({'Накопительный период': 0, 'Выплатной период': 1})
        clients['okato'] = clients['okato'].fillna('0').apply(lambda x: str(x)[:2])
        clients['phn'] = clients['phn'].map({'нет': 0, 'да': 1})
        clients['email'] = clients['email'].map({'нет': 0, 'да': 1})
        clients['lk'] = clients['lk'].map({'нет': 0, 'да': 1})
        clients['assgn_npo'] = clients['assgn_npo'].map({'нет': 0, 'да': 1})
        clients['assgn_ops'] = clients['assgn_ops'].map({'нет': 0, 'да': 1})

        self.processed_data = clients.astype(
            clients_columns_typing
        )

        return self.processed_data

In [9]:
# В папку configs необходимо переместить конфиг обработчика соответствующей версии
# и дополнить data.columns.clients параметром 'erly_pnsn_flg': 'int'
os.mkdir("/content/configs")
os.mkdir("/content/weights")

In [15]:
# Создать экземпляр класса соответствующей версии обработчика
processor = DataProcessorV1("/content/configs", "processor_v1.yaml")
data = processor.process(_, users.copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clients['gndr'] = clients['gndr'].map({'ж': 0, 'м': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clients['accnt_bgn_date'] = clients['accnt_bgn_date'].astype(np.int64) // 1e9
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clients['accnt_status'] = clients['accnt_status'].map({'Накопительный п

In [16]:
# Разделение датасета на обучающий и тестовый

target_column_name = 'erly_pnsn_flg'

X_data = data.drop([target_column_name], axis=1)
y_data = data[target_column_name]

X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=42,
    shuffle=True,
    stratify=data[target_column_name])

In [17]:
# Универсальный класс обучения Catboost

class CatBoostTrainer:
    def __init__(self, iterations: int, depth: int):
        self.iterations = iterations
        self.depth = depth

        self.cbc = None

    def train(self, X_train, y_train):
        # categorical_features = train_data.dtypes[
        #     (train_data.dtypes != np.float64) & (train_data.dtypes != np.int64)].index.tolist()

        pooled_train = Pool(data=X_train.drop(['accnt_id'], axis=1),
                            label=y_train
                            # cat_features=categorical_features
                            )

        # pooled_test = Pool(data=X_test.drop(['accnt_id'], axis=1),
        #                     label=y_test
        #                     # cat_features=categorical_features
        #                     )

        self.cbc = CatBoostClassifier(iterations=self.iterations,
                                 depth=self.depth,
                                 random_seed=42,
                                #  task_type="GPU",
                                #  devices="0:1",
                                 loss_function='CrossEntropy',
                                 eval_metric="F1")

        print("Обучение...")
        self.cbc.fit(pooled_train,
                # eval_set=pooled_test,
                use_best_model=True,
                verbose=True)

    def save(self, weights_path: str):
        self.cbc.save_model(weights_path,
                format="cbm",
                export_parameters=None,
                pool=None)

In [18]:
# Обучение модели и сохранение весов

weights_path = 'weights/classifier_v1.cbm'

model = CatBoostTrainer(
    iterations=10,
    depth=10)

model.train(X_train, y_train)
model.save(weights_path)

Обучение...


You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.9958082	total: 403ms	remaining: 3.63s
1:	learn: 0.9958082	total: 584ms	remaining: 2.34s
2:	learn: 0.9958624	total: 844ms	remaining: 1.97s
3:	learn: 0.9944902	total: 1.14s	remaining: 1.71s
4:	learn: 0.9944628	total: 1.39s	remaining: 1.39s
5:	learn: 0.9933645	total: 1.63s	remaining: 1.08s
6:	learn: 0.9940237	total: 1.86s	remaining: 797ms
7:	learn: 0.9944354	total: 2.01s	remaining: 503ms
8:	learn: 0.9944628	total: 2.26s	remaining: 251ms
9:	learn: 0.9944902	total: 2.54s	remaining: 0us


In [19]:
# Универсальный класс предиктора

class CatBoostPredictor:
    def __init__(self, ):
        self.cbc = CatBoostClassifier()

        self.accnt_ids = None
        self.predictions = None
        self.submission = None

    def load(self, weights_path: str):
        self.cbc.load_model(weights_path, format='cbm')

    def predict(self, X: pd.DataFrame):
        self.accnt_ids = X[['accnt_id']].copy()

        pooled = Pool(data=X.drop(['accnt_id'], axis=1))

        self.predictions = self.cbc.predict(pooled)

        return self.predictions

    def build_submission(self, submission_path: str):
        self.submission = self.accnt_ids.copy()
        self.submission['erly_pnsn_flg'] = self.predictions

        self.submission = self.submission.reset_index(drop=True)
        self.submission.to_csv(submission_path, header=True)
        return self.submission

In [20]:
model = CatBoostPredictor()
model.load(weights_path)
predictions = model.predict(X_test)
# submission = model.build_submission('submission.csv')

In [21]:
f1_score(y_test, model.cbc.predict(X_test))

0.9963786859803414