## Импорт библиотек

In [1]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [2]:
%load_ext autoreload
%autoreload 2

import gc

import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score

import catboost as cb
import lightgbm as lgb

pd.set_option('display.max_columns', None)

os.environ["CUDA_VISIBLE_DEVICES"] = '0'


## Config

In [3]:
TRAIN_TRANSACTIONS_PATH = '/content/drive/My Drive/Университет/Credit_Scoring/data/train/'
TEST_TRANSACTIONS_PATH = '/content/drive/My Drive/Университет/Credit_Scoring/data/test/'
TRAIN_TARGET_PATH = '/content/drive/My Drive/Университет/Credit_Scoring/data/train_target.csv'
TEST_TARGET_PATH = '/content/drive/My Drive/Университет/Credit_Scoring/data/test_target_contest.csv'
FEATURE_EXTRACTION_PATH_TRAIN = '/content/drive/My Drive/Университет/Credit_Scoring/data/feature_extraction/train/'
FEATURE_EXTRACTION_PATH_TEST = '/content/drive/My Drive/Университет/Credit_Scoring/data/feature_extraction/test/'


##  Извлечение признаков

In [4]:
class Feature:

    def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                        num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
        """
        читает num_parts_to_read партиций, преобразует их к pd.DataFrame и возвращает
        :param path_to_dataset: путь до директории с партициями
        :param start_from: номер партиции, с которой начать чтение
        :param num_parts_to_read: количество партиций, которые требуется прочитать
        :param columns: список колонок, которые нужно прочитать из партиции
        :return: pd.DataFrame
        """

        res = []
        dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                                  if filename.startswith('part')])  # Получаем список всех файлов, начинающихся с 'part'

        start_from = max(0, start_from)  # Убеждаемся, что стартовая партиция не меньше нуля
        chunks = dataset_paths[start_from: start_from + num_parts_to_read]  # Выбираем нужное количество партиций для чтения
        if verbose:
            print('Reading chunks:\n')  # Если verbose=True, выводим на печать имена партиций
            for chunk in chunks:
                print(chunk)
        for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса
            chunk = pd.read_parquet(chunk_path, columns=columns)  # Читаем текущую партицию, возможно, выбирая определенные колонки
            res.append(chunk)  # Добавляем прочитанную партицию в список
        return pd.concat(res).reset_index(drop=True)  # Объединяем все партиции в один DataFrame и сбрасываем индексы

        # константны для признаков танзакции
        CAT_COLUMNS = ['currency', 'operation_kind', 'card_type',
                      'operation_type', 'operation_type_group', 'ecommerce_flag',
                      'payment_system', 'income_flag', 'mcc', 'country', 'city',
                      'mcc_category', 'day_of_week', 'hour','weekofyear']

        NUMERIC_COLUMNS = ['days_before', 'hour_diff']

        REAL_COLUMNS = ['amnt']


    def __amnt_pivot_table_by_column_as_frame(frame, column, agg_funcs=None) -> pd.DataFrame:
        """
        Строит pivot table для между колонкой `amnt`  и column на основе переданных aggregations_on
        :param frame: pd.DataFrame транзакций
        :param column: название колонки, на основе `amnt`  и column будет построен pivot_table
        :param agg_funcs: список из функций, которые нужно применить, по умолчанию ['mean', 'count']
        :return: pd.DataFrame
        """
        if agg_funcs is None:
            agg_funcs = ['mean', 'count']
        aggs = pd.pivot_table(frame, values='amnt',
                              index=['app_id'], columns=[column],
                              aggfunc={'amnt': agg_funcs},
                              fill_value=0.0)
        aggs.columns = [f'{col[0]}_{column}_{col[1]}' for col in aggs.columns.values]
        return aggs


    def extract_basic_aggregations(transactions_frame: pd.DataFrame, cat_columns=None, agg_funcs=None) -> pd.DataFrame:
        """
        :param transactions_frame: pd.DataFrame с транзакциями
        :param cat_columns: список категориальных переменных, для которых будут построены агрегаты по `amnt`
        :param agg_funcs: список функций, который нужно применить для подсчета агрегатов, по умолчанию
        ['sum', 'mean', 'count']
        :return: pd.DataFrame с извлеченными признаками
        """
        if not cat_columns:
            cat_columns = Feature.CAT_COLUMNS

        pivot_tables = []
        for col in cat_columns:
            pivot_tables.append(Feature.__amnt_pivot_table_by_column_as_frame(transactions_frame, column=col,
                                                                      agg_funcs=agg_funcs))
        pivot_tables = pd.concat(pivot_tables, axis=1)

        aggs = {
            # посчитаем статистики для транзакций
            'amnt': ['mean', 'median', 'sum', 'std'],
            # посчитаем разумные агрегаты для разницы в часах между транзакциями
            'hour_diff': ['max', 'mean', 'median', 'var', 'std'],
            # добавим самую раннюю/позднюю и среднюю дату транзакции до подачи заявки на кредит
            'days_before': ['min', 'max', 'median']}

        numeric_stats = transactions_frame.groupby(['app_id']).agg(aggs)

        # дадим разумные имена новым колонкам;
        numeric_stats.columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]

        return pd.concat([pivot_tables, numeric_stats], axis=1).reset_index()


    def prepare_transactions_dataset(path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                              save_to_path=None, verbose: bool=False):
        """
        возвращает готовый pd.DataFrame с признаками, на которых можно учить модель для целевой задачи.
        path_to_dataset: str
            путь до датасета с партициями
        num_parts_to_preprocess_at_once: int
            количество партиций, которые будут одновременно держаться в памяти и обрабатываться
        num_parts_total: int
            общее количество партиций, которые нужно обработать
        save_to_path: str
            путь до папки, в которой будет сохранен каждый обработанный блок в .parquet формате. Если None, то не будет сохранен
        verbose: bool
            логирует каждый обрабатываемый кусок данных
        """
        preprocessed_frames = []
        block = 0
        for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                      desc="Transforming transactions data"):
            transactions_frame = Feature.read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once,
                                                                verbose=verbose)
            features = Feature.extract_basic_aggregations(transactions_frame,
                                                  cat_columns=['mcc_category', 'day_of_week', 'operation_type'])
            if save_to_path:
                block_as_str = str(block)
                if len(block_as_str) == 1:
                    block_as_str = '00' + block_as_str
                else:
                    block_as_str = '0' + block_as_str
                features.to_parquet(os.path.join(save_to_path, f'processed_chunk_{block_as_str}.parquet'))

            preprocessed_frames.append(features)
        return pd.concat(preprocessed_frames)


In [5]:
# Извлечём признаки из тренировочного дата-сета
data = Feature.prepare_transactions_dataset(TRAIN_TRANSACTIONS_PATH, num_parts_to_preprocess_at_once=2, num_parts_total=50,
                                    save_to_path=FEATURE_EXTRACTION_PATH_TRAIN)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),


Transforming transactions data:   0%|          | 0/25 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
print(f'Объем в RAM всего датасета: {round(data.memory_usage(index=True).sum() / 10**9, 3)} Gb')


Объем в RAM всего датасета: 0.975 Gb


In [7]:
# Извлечём признаки из тестового дата-сета
test_data = Feature.prepare_transactions_dataset(TEST_TRANSACTIONS_PATH, num_parts_to_preprocess_at_once=3, num_parts_total=50,
                                         save_to_path=FEATURE_EXTRACTION_PATH_TEST)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),


Transforming transactions data:   0%|          | 0/17 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/3 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):  # Читаем партиции с отображением прогресса


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

## Загрузка и объединение данных

In [8]:
test_target =  pd.read_csv(TEST_TARGET_PATH)
test_target.head()


Unnamed: 0,app_id,product
0,1063620,0
1,1063621,0
2,1063622,1
3,1063623,1
4,1063624,2


In [9]:
merged_test_data = test_data.merge(test_target[['app_id', 'product']], on='app_id')
merged_test_data


Unnamed: 0,app_id,count_mcc_category_1,count_mcc_category_2,count_mcc_category_3,count_mcc_category_4,count_mcc_category_5,count_mcc_category_6,count_mcc_category_7,count_mcc_category_8,count_mcc_category_9,count_mcc_category_10,count_mcc_category_11,count_mcc_category_12,count_mcc_category_13,count_mcc_category_14,count_mcc_category_15,count_mcc_category_16,count_mcc_category_17,count_mcc_category_18,count_mcc_category_19,count_mcc_category_20,count_mcc_category_21,count_mcc_category_22,count_mcc_category_23,count_mcc_category_24,count_mcc_category_25,count_mcc_category_26,count_mcc_category_27,count_mcc_category_28,mean_mcc_category_1,mean_mcc_category_2,mean_mcc_category_3,mean_mcc_category_4,mean_mcc_category_5,mean_mcc_category_6,mean_mcc_category_7,mean_mcc_category_8,mean_mcc_category_9,mean_mcc_category_10,mean_mcc_category_11,mean_mcc_category_12,mean_mcc_category_13,mean_mcc_category_14,mean_mcc_category_15,mean_mcc_category_16,mean_mcc_category_17,mean_mcc_category_18,mean_mcc_category_19,mean_mcc_category_20,mean_mcc_category_21,mean_mcc_category_22,mean_mcc_category_23,mean_mcc_category_24,mean_mcc_category_25,mean_mcc_category_26,mean_mcc_category_27,mean_mcc_category_28,count_day_of_week_1,count_day_of_week_2,count_day_of_week_3,count_day_of_week_4,count_day_of_week_5,count_day_of_week_6,count_day_of_week_7,mean_day_of_week_1,mean_day_of_week_2,mean_day_of_week_3,mean_day_of_week_4,mean_day_of_week_5,mean_day_of_week_6,mean_day_of_week_7,count_operation_type_1,count_operation_type_2,count_operation_type_3,count_operation_type_4,count_operation_type_5,count_operation_type_6,count_operation_type_7,count_operation_type_8,count_operation_type_9,count_operation_type_10,count_operation_type_11,count_operation_type_12,count_operation_type_13,count_operation_type_14,count_operation_type_15,count_operation_type_16,count_operation_type_17,count_operation_type_18,count_operation_type_19,count_operation_type_20,count_operation_type_21,count_operation_type_22,mean_operation_type_1,mean_operation_type_2,mean_operation_type_3,mean_operation_type_4,mean_operation_type_5,mean_operation_type_6,mean_operation_type_7,mean_operation_type_8,mean_operation_type_9,mean_operation_type_10,mean_operation_type_11,mean_operation_type_12,mean_operation_type_13,mean_operation_type_14,mean_operation_type_15,mean_operation_type_16,mean_operation_type_17,mean_operation_type_18,mean_operation_type_19,mean_operation_type_20,mean_operation_type_21,mean_operation_type_22,amnt_mean,amnt_median,amnt_sum,amnt_std,hour_diff_max,hour_diff_mean,hour_diff_median,hour_diff_var,hour_diff_std,days_before_min,days_before_max,days_before_median,product
0,1063620,67,87,224,79,12,72,20,14,44,1,3,3,9,0,13,9,6,1,8,0,0,0,0,0,0,1,0,0,0.374893,0.450460,0.232218,0.339087,0.374568,0.304501,0.356436,0.410680,0.413262,0.464859,0.404743,0.381515,0.441680,0.000000,0.442177,0.448111,0.421683,0.429013,0.368924,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.579884,0.0,0.0,106,87,105,87,92,87,109,0.334436,0.331190,0.335415,0.327747,0.361581,0.305618,0.328561,0,549,3,29,26,26,0,8,22,0,9,0,1,0,0,0,0,0,0,0,0,0,0.000000,0.310710,0.508202,0.529622,0.432579,0.361804,0.000000,0.459533,0.351000,0.000000,0.413964,0.000000,0.422172,0.0,0.0,0.000000,0.0,0.000000,0.000000,0,0.0,0.000000,0.332338,0.354278,223.663752,0.143467,238,12.716196,4.0,558.742252,23.637729,1,359,108.0,0
1,1063621,78,37,1,1,6,32,6,46,3,0,8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.333128,0.413989,0.349947,0.345966,0.334767,0.268163,0.358662,0.270007,0.434815,0.000000,0.263400,0.363534,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,33,35,25,39,25,45,17,0.347747,0.316946,0.349675,0.299930,0.353718,0.298355,0.332825,0,26,2,4,25,15,0,14,0,0,0,125,4,0,0,0,0,3,0,0,0,1,0.000000,0.350639,0.463622,0.534908,0.395807,0.355931,0.000000,0.469286,0.000000,0.000000,0.000000,0.278351,0.286057,0.0,0.0,0.000000,0.0,0.336356,0.000000,0,0.0,0.000000,0.323903,0.324725,70.934822,0.134166,695,37.945205,13.0,6849.685057,82.762824,7,353,130.0,0
2,1063622,394,70,128,25,36,176,23,3,15,3,1,0,0,6,12,1,0,1,0,0,0,0,0,1,0,0,0,0,0.278751,0.403717,0.279859,0.290066,0.327409,0.246515,0.267015,0.446753,0.342263,0.344876,0.357151,0.000000,0.000000,0.300321,0.328686,0.246590,0.000000,0.546010,0.000000,0.0,0.000000,0.000000,0.000000,0.436826,0.0,0.000000,0.0,0.0,133,145,138,104,130,115,130,0.291878,0.292458,0.281156,0.286131,0.291214,0.283205,0.285445,823,0,48,2,0,0,20,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0.277378,0.000000,0.411560,0.432811,0.000000,0.000000,0.381985,0.000000,0.000000,0.387677,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0,0.0,0.000000,0.287506,0.288515,257.317699,0.069607,289,9.538547,0.0,855.219709,29.244140,1,359,136.0,1
3,1063623,231,143,54,39,3,18,32,30,17,5,35,5,2,0,2,5,1,0,1,0,2,5,0,0,0,0,0,0,0.372485,0.477430,0.348056,0.348874,0.381429,0.382486,0.359761,0.414609,0.458773,0.325058,0.318315,0.357156,0.554295,0.000000,0.391238,0.423332,0.365149,0.000000,0.406542,0.0,0.446096,0.386010,0.000000,0.000000,0.0,0.000000,0.0,0.0,118,68,117,94,86,99,48,0.405883,0.400567,0.395004,0.389794,0.390732,0.385172,0.394298,441,1,121,7,28,0,15,0,13,1,0,0,2,0,0,1,0,0,0,0,0,0,0.366401,0.447771,0.484072,0.496986,0.411189,0.000000,0.414732,0.000000,0.381877,0.431890,0.000000,0.000000,0.554295,0.0,0.0,0.328872,0.0,0.000000,0.000000,0,0.0,0.000000,0.394683,0.389334,248.650119,0.076571,202,13.538095,5.0,581.037497,24.104719,1,357,201.5,1
4,1063624,31,55,1,17,1,0,8,0,2,0,10,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0.283374,0.431195,0.151959,0.312128,0.375170,0.000000,0.299778,0.000000,0.369274,0.000000,0.310530,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.297589,0.510860,0.0,0.000000,0.0,0.0,36,14,14,14,12,24,13,0.365195,0.312167,0.374987,0.381223,0.367486,0.358536,0.328091,0,63,26,8,0,0,0,11,0,0,8,0,0,0,0,9,0,0,0,0,0,2,0.000000,0.298269,0.480358,0.288790,0.000000,0.000000,0.000000,0.446230,0.000000,0.000000,0.365470,0.000000,0.000000,0.0,0.0,0.319717,0.0,0.000000,0.000000,0,0.0,0.541919,0.357356,0.332807,45.384167,0.101596,501,65.173228,7.0,12905.033246,113.600322,14,359,227.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502711,1580438,99,51,36,70,25,22,22,6,13,8,1,2,7,68,3,0,1,4,0,0,2,0,0,1,0,0,0,0,0.341001,0.493643,0.289776,0.335574,0.382392,0.375003,0.334023,0.526157,0.447533,0.381469,0.504243,0.462819,0.432937,0.430976,0.390865,0.000000,0.359050,0.406780,0.000000,0.0,0.476930,0.000000,0.000000,0.268175,0.0,0.000000,0.0,0.0,71,67,44,71,54,57,77,0.390231,0.380728,0.397137,0.395734,0.346802,0.397642,0.363771,382,0,17,34,0,4,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0.363621,0.000000,0.411518,0.534705,0.000000,0.520419,0.000000,0.000000,0.476930,0.537631,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0,0.0,0.000000,0.381382,0.368323,168.189620,0.091406,412,19.376417,4.0,1680.289806,40.991338,1,357,215.0,3
502712,1580439,188,125,493,387,13,53,61,6,30,3,10,104,8,0,33,26,2,2,0,0,7,2,0,2,0,2,0,0,0.283793,0.422318,0.237165,0.282812,0.324821,0.329595,0.365371,0.360802,0.414299,0.336434,0.291310,0.349408,0.377725,0.000000,0.365876,0.347559,0.465709,0.491463,0.000000,0.0,0.398184,0.429963,0.000000,0.217863,0.0,0.556838,0.0,0.0,268,178,241,223,259,237,151,0.295540,0.295225,0.291926,0.278672,0.287454,0.316509,0.323672,8,1399,83,12,12,0,0,1,33,0,6,0,1,0,0,0,0,0,2,0,0,0,0.349120,0.286879,0.429450,0.484541,0.318863,0.000000,0.000000,0.565487,0.289678,0.000000,0.337117,0.000000,0.392765,0.0,0.0,0.000000,0.0,0.000000,0.314473,0,0.0,0.000000,0.297104,0.310055,462.590515,0.115851,141,5.506101,3.0,84.517476,9.193339,1,359,188.0,1
502713,1580440,0,105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.365184,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,21,5,17,15,16,14,17,0.348992,0.302938,0.353302,0.412486,0.389137,0.333919,0.376841,0,0,46,31,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0.000000,0.000000,0.503873,0.489231,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0,0.0,0.000000,0.365184,0.500693,38.344290,0.231980,361,77.104762,31.0,10035.075458,100.175224,17,354,167.0,0
502714,1580441,29,25,23,11,20,1,2,0,0,5,2,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.319660,0.506010,0.285336,0.295628,0.342952,0.220630,0.373185,0.000000,0.000000,0.324953,0.306111,0.000000,0.276893,0.457785,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,20,8,22,21,21,17,13,0.352167,0.364534,0.349893,0.365597,0.342300,0.382183,0.340622,58,39,8,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.310481,0.327952,0.460792,0.527288,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0,0.0,0.000000,0.356133,0.331558,43.448269,0.096843,42,7.639344,3.0,89.323398,9.451106,1,40,20.0,0


In [10]:
targets = pd.read_csv(TRAIN_TARGET_PATH)
targets.head()


Unnamed: 0,app_id,product,flag
0,0,3,0
1,1,1,0
2,2,1,0
3,3,1,0
4,4,1,0


In [11]:
merged_data = data.merge(targets[['app_id', 'product', 'flag']], on=['app_id'])
merged_data


Unnamed: 0,app_id,count_mcc_category_1,count_mcc_category_2,count_mcc_category_3,count_mcc_category_4,count_mcc_category_5,count_mcc_category_6,count_mcc_category_7,count_mcc_category_8,count_mcc_category_9,count_mcc_category_10,count_mcc_category_11,count_mcc_category_12,count_mcc_category_13,count_mcc_category_14,count_mcc_category_15,count_mcc_category_16,count_mcc_category_17,count_mcc_category_18,count_mcc_category_19,count_mcc_category_20,count_mcc_category_21,count_mcc_category_22,count_mcc_category_23,count_mcc_category_24,count_mcc_category_25,count_mcc_category_26,count_mcc_category_27,count_mcc_category_28,mean_mcc_category_1,mean_mcc_category_2,mean_mcc_category_3,mean_mcc_category_4,mean_mcc_category_5,mean_mcc_category_6,mean_mcc_category_7,mean_mcc_category_8,mean_mcc_category_9,mean_mcc_category_10,mean_mcc_category_11,mean_mcc_category_12,mean_mcc_category_13,mean_mcc_category_14,mean_mcc_category_15,mean_mcc_category_16,mean_mcc_category_17,mean_mcc_category_18,mean_mcc_category_19,mean_mcc_category_20,mean_mcc_category_21,mean_mcc_category_22,mean_mcc_category_23,mean_mcc_category_24,mean_mcc_category_25,mean_mcc_category_26,mean_mcc_category_27,mean_mcc_category_28,count_day_of_week_1,count_day_of_week_2,count_day_of_week_3,count_day_of_week_4,count_day_of_week_5,count_day_of_week_6,count_day_of_week_7,mean_day_of_week_1,mean_day_of_week_2,mean_day_of_week_3,mean_day_of_week_4,mean_day_of_week_5,mean_day_of_week_6,mean_day_of_week_7,count_operation_type_1,count_operation_type_2,count_operation_type_3,count_operation_type_4,count_operation_type_5,count_operation_type_6,count_operation_type_7,count_operation_type_8,count_operation_type_9,count_operation_type_10,count_operation_type_11,count_operation_type_13,count_operation_type_14,count_operation_type_15,count_operation_type_16,count_operation_type_17,count_operation_type_19,count_operation_type_20,count_operation_type_21,count_operation_type_22,mean_operation_type_1,mean_operation_type_2,mean_operation_type_3,mean_operation_type_4,mean_operation_type_5,mean_operation_type_6,mean_operation_type_7,mean_operation_type_8,mean_operation_type_9,mean_operation_type_10,mean_operation_type_11,mean_operation_type_13,mean_operation_type_14,mean_operation_type_15,mean_operation_type_16,mean_operation_type_17,mean_operation_type_19,mean_operation_type_20,mean_operation_type_21,mean_operation_type_22,amnt_mean,amnt_median,amnt_sum,amnt_std,hour_diff_max,hour_diff_mean,hour_diff_median,hour_diff_var,hour_diff_std,days_before_min,days_before_max,days_before_median,count_operation_type_12,count_operation_type_18,mean_operation_type_12,mean_operation_type_18,product,flag
0,0,38,109,1,1,0,0,11,4,6,0,4,0,1,3,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0.314338,0.411590,0.282643,0.258414,0.000000,0.000000,0.358983,0.476752,0.417680,0.000000,0.336135,0.000000,0.450299,0.358530,0.000000,0.348838,0.000000,0.000000,0.000000,0.000000,0.526191,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,18,28,20,31,29,31,24,0.407102,0.367082,0.405063,0.370230,0.389952,0.397770,0.381615,5,58,70,10,14,1,12,2,1,4,0,3,0,0,1,0,0,0,0,0,0.391478,0.334183,0.423406,0.450615,0.400463,0.426544,0.325863,0.387677,0.527853,0.476752,0.000000,0.341430,0.0,0.000000,0.320248,0.0,0.0,0,0.000000,0.0,0.386645,0.387677,69.982751,0.094032,555,44.613260,20.0,5505.749601,74.200739,14,351,140.0,,,,,3,0
1,1,82,98,21,25,22,7,20,0,4,3,53,3,0,10,5,0,1,2,0,0,0,0,0,0,0,0,0,0,0.308654,0.402700,0.324518,0.290052,0.368639,0.283080,0.288406,0.000000,0.415924,0.372053,0.275840,0.325762,0.000000,0.370235,0.315308,0.000000,0.406542,0.311260,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,41,46,77,65,44,48,35,0.342776,0.340613,0.346442,0.330076,0.315696,0.322229,0.347832,0,253,83,9,0,0,0,6,0,0,0,5,0,0,0,0,0,0,0,0,0.000000,0.310619,0.400795,0.404996,0.000000,0.000000,0.000000,0.425604,0.000000,0.000000,0.000000,0.266696,0.0,0.000000,0.000000,0.0,0.0,0,0.000000,0.0,0.335351,0.336345,119.384783,0.078044,517,24.053371,14.0,2624.771791,51.232527,1,358,177.0,,,,,1,0
2,2,87,66,2,1,15,4,0,3,0,0,0,0,0,1,0,50,0,0,0,0,0,0,0,0,0,0,0,0,0.297326,0.324500,0.283320,0.352522,0.299199,0.310019,0.000000,0.376802,0.000000,0.000000,0.000000,0.000000,0.000000,0.387677,0.000000,0.292975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,44,22,25,35,34,35,34,0.303431,0.262959,0.289129,0.329806,0.316246,0.307845,0.313648,102,2,2,0,7,5,24,0,55,3,0,0,0,28,0,0,0,1,0,0,0.296406,0.350680,0.480618,0.000000,0.404769,0.349589,0.416578,0.000000,0.302113,0.376802,0.000000,0.000000,0.0,0.209878,0.000000,0.0,0.0,0,0.000000,0.0,0.306107,0.310611,70.098407,0.087890,875,36.655022,13.0,7120.183100,84.381177,2,351,198.0,,,,,1,0
3,3,12,17,20,6,1,1,6,0,0,0,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0.273827,0.423153,0.269708,0.338280,0.307306,0.427379,0.319819,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.420283,0.000000,0.000000,0.000000,0.000000,0.346315,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,16,12,5,11,8,8,7,0.310740,0.360748,0.287993,0.305493,0.320624,0.305813,0.435919,16,27,5,0,3,1,0,7,7,0,1,0,0,0,0,0,0,0,0,0,0.236233,0.342831,0.392674,0.000000,0.361776,0.313312,0.000000,0.491988,0.276339,0.000000,0.387677,0.000000,0.0,0.000000,0.000000,0.0,0.0,0,0.000000,0.0,0.330808,0.298968,22.164130,0.098987,1446,109.104478,4.0,81713.761646,285.856190,20,325,144.0,,,,,1,0
4,4,5,86,0,0,0,0,0,1,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.311167,0.443438,0.000000,0.000000,0.000000,0.000000,0.000000,0.449287,0.000000,0.000000,0.323938,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,24,9,15,18,18,20,13,0.451313,0.479648,0.402809,0.366100,0.422217,0.358765,0.437209,32,0,2,0,52,17,13,0,0,1,0,0,0,0,0,0,0,0,0,0,0.301696,0.000000,0.432836,0.000000,0.493529,0.399371,0.370550,0.000000,0.000000,0.449287,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0,0.000000,0.0,0.412301,0.387677,48.239197,0.128579,360,71.837607,35.0,8030.413056,89.612572,1,351,212.0,,,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963806,1003045,4,20,0,0,0,0,0,28,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0.384463,0.454561,0.000000,0.000000,0.000000,0.000000,0.000000,0.469708,0.000000,0.000000,0.363362,0.000000,0.000000,0.000000,0.000000,0.000000,0.340207,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,12,7,7,4,7,11,7,0.456380,0.465845,0.439046,0.421405,0.470465,0.446175,0.446726,7,0,3,2,17,0,0,0,0,16,10,0,0,0,0,0,0,0,0,0,0.368804,0.000000,0.534121,0.430585,0.463713,0.000000,0.000000,0.000000,0.000000,0.472733,0.433265,0.000000,0.0,0.000000,0.000000,0.0,0.0,0,0.000000,0.0,0.451358,0.433150,24.824672,0.076019,843,146.400000,25.0,57078.244444,238.910536,8,344,219.0,0.0,0.0,0.0,0.0,1,0
963807,1003047,43,0,103,14,4,16,15,32,12,6,5,0,0,0,1,1,1,4,0,0,0,2,0,1,0,0,0,0,0.430489,0.000000,0.413797,0.422825,0.502766,0.259138,0.453856,0.590336,0.514821,0.431574,0.456269,0.000000,0.000000,0.000000,0.436770,0.393020,0.344412,0.435717,0.000000,0.000000,0.000000,0.493504,0.0,0.65266,0.0,0.0,0.0,0.0,39,11,33,62,51,57,7,0.424596,0.393294,0.451117,0.446991,0.448959,0.446893,0.379614,225,3,0,0,3,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0.420214,0.357987,0.000000,0.000000,0.616937,0.000000,0.000000,0.000000,0.000000,0.587585,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0,0.000000,0.0,0.440434,0.439676,114.512897,0.099036,370,32.896154,5.0,3399.730487,58.307208,2,359,183.0,0.0,0.0,0.0,0.0,0,0
963808,1003048,38,45,15,15,29,23,8,10,12,6,1,6,1,2,0,0,3,1,1,0,3,0,0,0,0,0,0,0,0.362814,0.546826,0.321298,0.373055,0.388160,0.338561,0.413243,0.492214,0.485768,0.362616,0.387677,0.386369,0.548934,0.377084,0.000000,0.000000,0.463659,0.555304,0.320061,0.000000,0.592347,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,33,37,14,22,27,32,54,0.454423,0.431430,0.386160,0.368907,0.432917,0.448991,0.400014,36,130,10,6,26,0,0,10,0,0,0,0,0,0,0,0,0,0,1,0,0.384881,0.384462,0.641382,0.611635,0.495293,0.000000,0.000000,0.484212,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0,0.499168,0.0,0.420723,0.403840,92.138302,0.103740,1245,39.246575,7.0,14031.149931,118.453155,1,359,92.0,0.0,0.0,0.0,0.0,1,0
963809,1003049,33,27,25,1,3,4,9,0,2,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.324305,0.448587,0.262239,0.398503,0.278997,0.407844,0.255197,0.000000,0.388313,0.245606,0.283726,0.000000,0.000000,0.348836,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,19,15,13,13,10,18,21,0.354750,0.313026,0.419404,0.325441,0.359286,0.313012,0.303425,0,82,1,0,0,0,1,23,0,0,2,0,0,0,0,0,0,0,0,0,0.000000,0.300071,0.500693,0.000000,0.000000,0.000000,0.510295,0.446701,0.000000,0.000000,0.413369,0.000000,0.0,0.000000,0.000000,0.0,0.0,0,0.000000,0.0,0.336859,0.317085,36.717646,0.090648,702,71.165138,3.0,20865.602107,144.449306,29,352,194.0,0.0,0.0,0.0,0.0,0,0


In [12]:
features = [x for x in merged_data.columns if x not in ['app_id', 'flag']]
features


['count_mcc_category_1',
 'count_mcc_category_2',
 'count_mcc_category_3',
 'count_mcc_category_4',
 'count_mcc_category_5',
 'count_mcc_category_6',
 'count_mcc_category_7',
 'count_mcc_category_8',
 'count_mcc_category_9',
 'count_mcc_category_10',
 'count_mcc_category_11',
 'count_mcc_category_12',
 'count_mcc_category_13',
 'count_mcc_category_14',
 'count_mcc_category_15',
 'count_mcc_category_16',
 'count_mcc_category_17',
 'count_mcc_category_18',
 'count_mcc_category_19',
 'count_mcc_category_20',
 'count_mcc_category_21',
 'count_mcc_category_22',
 'count_mcc_category_23',
 'count_mcc_category_24',
 'count_mcc_category_25',
 'count_mcc_category_26',
 'count_mcc_category_27',
 'count_mcc_category_28',
 'mean_mcc_category_1',
 'mean_mcc_category_2',
 'mean_mcc_category_3',
 'mean_mcc_category_4',
 'mean_mcc_category_5',
 'mean_mcc_category_6',
 'mean_mcc_category_7',
 'mean_mcc_category_8',
 'mean_mcc_category_9',
 'mean_mcc_category_10',
 'mean_mcc_category_11',
 'mean_mcc_cate

## Обучение модели LightGBM на CPU

In [38]:
targets = merged_data.flag.values

cv = KFold(n_splits=5, random_state=100, shuffle=True)

oof = np.zeros(len(merged_data))
train_preds = np.zeros(len(merged_data))

models = []

tree_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 3,
    'reg_lambda': 1,
    'num_leaves': 64,
    'n_jobs': 5,
    'n_estimators': 1_000
}

for fold_, (train_idx, val_idx) in enumerate(cv.split(merged_data, targets), 1):
    print(f'Training with fold {fold_} started.')

    train, val = merged_data.iloc[train_idx], merged_data.iloc[val_idx]

    train_data = lgb.Dataset(train[features], label=train.flag.values)
    val_data = lgb.Dataset(val[features], label=val.flag.values, reference=train_data)

    bst = lgb.train(
        params=tree_params,
        train_set=train_data,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=2), lgb.log_evaluation(period=50)]
    )

    oof[val_idx] = bst.predict(val[features], num_iteration=bst.best_iteration)
    train_preds[train_idx] += bst.predict(train[features], num_iteration=bst.best_iteration) / (cv.n_splits - 1)
    models.append(bst)
    print(f'Training with fold {fold_} completed.')


Training with fold 1 started.




[LightGBM] [Info] Number of positive: 21300, number of negative: 749748
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.458712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26582
[LightGBM] [Info] Number of data points in the train set: 771048, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027625 -> initscore=-3.561030
[LightGBM] [Info] Start training from score -3.561030
Training until validation scores don't improve for 2 rounds
[50]	train's auc: 0.741709	valid's auc: 0.733569
[100]	train's auc: 0.756043	valid's auc: 0.745821
[150]	train's auc: 0.763203	valid's auc: 0.751263
[200]	train's auc: 0.768109	valid's auc: 0.755035
[250]	train's auc: 0.771792	valid's auc: 0.757607
[300]	train's auc: 0.775071	valid's auc: 0.759658
Early stopping, best iteration is:
[306]	train's auc: 0.775417	valid's auc: 0.759824
Training with fold 1 completed.
Training with fold 2 start



[LightGBM] [Info] Number of positive: 21263, number of negative: 749786
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.915802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26578
[LightGBM] [Info] Number of data points in the train set: 771049, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027577 -> initscore=-3.562819
[LightGBM] [Info] Start training from score -3.562819
Training until validation scores don't improve for 2 rounds
[50]	train's auc: 0.741541	valid's auc: 0.735806
[100]	train's auc: 0.756017	valid's auc: 0.748626
[150]	train's auc: 0.763168	valid's auc: 0.75423
[200]	train's auc: 0.76802	valid's auc: 0.75759
[250]	train's auc: 0.771814	valid's auc: 0.759761
[300]	train's auc: 0.775069	valid's auc: 0.761617
Early stopping, best iteration is:
[304]	train's auc: 0.775329	valid's auc: 0.7617



[LightGBM] [Info] Number of positive: 21268, number of negative: 749781
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.494252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26597
[LightGBM] [Info] Number of data points in the train set: 771049, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027583 -> initscore=-3.562578
[LightGBM] [Info] Start training from score -3.562578
Training until validation scores don't improve for 2 rounds
[50]	train's auc: 0.73978	valid's auc: 0.738215
[100]	train's auc: 0.754901	valid's auc: 0.75175
[150]	train's auc: 0.761721	valid's auc: 0.757028
[200]	train's auc: 0.766646	valid's auc: 0.760752
[250]	train's auc: 0.770507	valid's auc: 0.763305
[300]	train's auc: 0.773883	valid's auc: 0.765486
Early stopping, best iteration is:
[338]	train's auc: 0.775954	valid's auc: 0.76665
Training with fold 3 completed.
Training with fold 4 started.



[LightGBM] [Info] Number of positive: 21249, number of negative: 749800
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.254721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26585
[LightGBM] [Info] Number of data points in the train set: 771049, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027559 -> initscore=-3.563497
[LightGBM] [Info] Start training from score -3.563497
Training until validation scores don't improve for 2 rounds
[50]	train's auc: 0.741029	valid's auc: 0.730437
[100]	train's auc: 0.756072	valid's auc: 0.744672
[150]	train's auc: 0.763227	valid's auc: 0.75036
[200]	train's auc: 0.768243	valid's auc: 0.75434
[250]	train's auc: 0.772044	valid's auc: 0.756996
[300]	train's auc: 0.775198	valid's auc: 0.759049
[350]	train's auc: 0.777935	valid's auc: 0.760596
[400]	train's auc: 0.780502	valid's auc: 0.761935
Early stopping, best iteration is:
[404]	trai



[LightGBM] [Info] Number of positive: 21228, number of negative: 749821
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.437962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26656
[LightGBM] [Info] Number of data points in the train set: 771049, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027531 -> initscore=-3.564513
[LightGBM] [Info] Start training from score -3.564513
Training until validation scores don't improve for 2 rounds
[50]	train's auc: 0.738671	valid's auc: 0.737027
[100]	train's auc: 0.753891	valid's auc: 0.750886
[150]	train's auc: 0.761097	valid's auc: 0.756315
[200]	train's auc: 0.76644	valid's auc: 0.760125
[250]	train's auc: 0.770219	valid's auc: 0.762406
[300]	train's auc: 0.773495	valid's auc: 0.764325
[350]	train's auc: 0.776508	valid's auc: 0.765992
[400]	train's auc: 0.779048	valid's auc: 0.767199
Early stopping, best iteration is:
[411]	tra

### Скоры

In [39]:
'Train roc-auc', roc_auc_score(targets, train_preds)


('Train roc-auc', 0.778674690234171)

In [40]:
'CV roc-auc', roc_auc_score(targets, oof)


('CV roc-auc', 0.7635313699625546)

## Обучение модели Catboost на GPU

In [13]:
targets = merged_data.flag.values

cv = KFold(n_splits=5, random_state=100, shuffle=True)

oof = np.zeros(len(merged_data))
train_preds = np.zeros(len(merged_data))

models = []


tree_params = {
    'max_depth': 5,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_state': 100,
    'l2_leaf_reg': 1,
    'task_type': 'GPU' # если имеется GPU с CUDA, то можно ее задействовать, чтобы на порядок ускорить обучение
}


for fold_, (train_idx, val_idx) in enumerate(cv.split(merged_data, targets), 1):
    print(f'Training with fold {fold_} started.')
    model = cb.CatBoostClassifier(**tree_params)

    train, val = merged_data.iloc[train_idx], merged_data.iloc[val_idx]

    train_pool = cb.Pool(train[features], train.flag.values)
    val_pool = cb.Pool(val[features], val.flag.values)


    model.fit(train_pool, eval_set=[val_pool], early_stopping_rounds=100, verbose_eval=50, use_best_model=True, plot=False)

    oof[val_idx] = model.predict_proba(val_pool)[:, 1]

    train_preds[train_idx] += model.predict_proba(train_pool)[:, 1] / (cv.n_splits-1)
    models.append(model)
    print(f'Training with fold {fold_} completed.')

Training with fold 1 started.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6316783	best: 0.6316783 (0)	total: 494ms	remaining: 8m 13s
50:	test: 0.7359870	best: 0.7359870 (50)	total: 1.49s	remaining: 27.7s
100:	test: 0.7418430	best: 0.7418430 (100)	total: 2.35s	remaining: 20.9s
150:	test: 0.7456670	best: 0.7456670 (150)	total: 3.28s	remaining: 18.4s
200:	test: 0.7489372	best: 0.7489372 (200)	total: 4.1s	remaining: 16.3s
250:	test: 0.7513135	best: 0.7513135 (250)	total: 5.01s	remaining: 15s
300:	test: 0.7527898	best: 0.7527898 (300)	total: 5.81s	remaining: 13.5s
350:	test: 0.7540968	best: 0.7540968 (350)	total: 6.59s	remaining: 12.2s
400:	test: 0.7551830	best: 0.7551838 (399)	total: 7.38s	remaining: 11s
450:	test: 0.7561705	best: 0.7561705 (450)	total: 8.14s	remaining: 9.91s
500:	test: 0.7569168	best: 0.7569168 (500)	total: 8.94s	remaining: 8.9s
550:	test: 0.7576693	best: 0.7576693 (550)	total: 10.6s	remaining: 8.63s
600:	test: 0.7582715	best: 0.7582733 (598)	total: 13.2s	remaining: 8.78s
650:	test: 0.7588412	best: 0.7588437 (649)	total: 14.3s	remain

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6273480	best: 0.6273480 (0)	total: 30.8ms	remaining: 30.8s
50:	test: 0.7359306	best: 0.7359306 (50)	total: 1.01s	remaining: 18.8s
100:	test: 0.7432002	best: 0.7432002 (100)	total: 1.88s	remaining: 16.7s
150:	test: 0.7476014	best: 0.7476014 (150)	total: 2.72s	remaining: 15.3s
200:	test: 0.7508373	best: 0.7508373 (200)	total: 3.52s	remaining: 14s
250:	test: 0.7531317	best: 0.7531317 (250)	total: 4.33s	remaining: 12.9s
300:	test: 0.7547815	best: 0.7547815 (300)	total: 5.21s	remaining: 12.1s
350:	test: 0.7561931	best: 0.7561931 (350)	total: 6s	remaining: 11.1s
400:	test: 0.7574802	best: 0.7574802 (400)	total: 6.78s	remaining: 10.1s
450:	test: 0.7584585	best: 0.7584585 (450)	total: 7.56s	remaining: 9.21s
500:	test: 0.7592531	best: 0.7592531 (500)	total: 8.35s	remaining: 8.31s
550:	test: 0.7598011	best: 0.7598019 (546)	total: 9.36s	remaining: 7.63s
600:	test: 0.7604136	best: 0.7604136 (600)	total: 11.7s	remaining: 7.76s
650:	test: 0.7610429	best: 0.7610429 (650)	total: 13.4s	remai

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6359841	best: 0.6359841 (0)	total: 31.2ms	remaining: 31.2s
50:	test: 0.7397622	best: 0.7397622 (50)	total: 1s	remaining: 18.7s
100:	test: 0.7455820	best: 0.7455820 (100)	total: 1.89s	remaining: 16.8s
150:	test: 0.7497079	best: 0.7497079 (150)	total: 2.72s	remaining: 15.3s
200:	test: 0.7527815	best: 0.7527815 (200)	total: 3.57s	remaining: 14.2s
250:	test: 0.7555961	best: 0.7555961 (250)	total: 4.47s	remaining: 13.3s
300:	test: 0.7571081	best: 0.7571081 (300)	total: 6.54s	remaining: 15.2s
350:	test: 0.7585376	best: 0.7585376 (350)	total: 9.13s	remaining: 16.9s
400:	test: 0.7598209	best: 0.7598209 (400)	total: 10.6s	remaining: 15.9s
450:	test: 0.7607880	best: 0.7607880 (450)	total: 13s	remaining: 15.8s
500:	test: 0.7617958	best: 0.7617958 (500)	total: 14.6s	remaining: 14.5s
550:	test: 0.7625109	best: 0.7625109 (550)	total: 15.3s	remaining: 12.5s
600:	test: 0.7632868	best: 0.7632870 (599)	total: 16.1s	remaining: 10.7s
650:	test: 0.7640816	best: 0.7640816 (650)	total: 16.9s	remai

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6322695	best: 0.6322695 (0)	total: 31.2ms	remaining: 31.2s
50:	test: 0.7324776	best: 0.7325769 (49)	total: 999ms	remaining: 18.6s
100:	test: 0.7393034	best: 0.7393034 (100)	total: 1.88s	remaining: 16.7s
150:	test: 0.7441899	best: 0.7441899 (150)	total: 2.73s	remaining: 15.3s
200:	test: 0.7478303	best: 0.7478303 (200)	total: 3.59s	remaining: 14.3s
250:	test: 0.7500377	best: 0.7500377 (250)	total: 4.41s	remaining: 13.2s
300:	test: 0.7517632	best: 0.7517632 (300)	total: 5.24s	remaining: 12.2s
350:	test: 0.7528809	best: 0.7528809 (350)	total: 6.03s	remaining: 11.2s
400:	test: 0.7540504	best: 0.7540504 (400)	total: 7.38s	remaining: 11s
450:	test: 0.7550642	best: 0.7550642 (450)	total: 9.52s	remaining: 11.6s
500:	test: 0.7560252	best: 0.7560252 (500)	total: 11.5s	remaining: 11.5s
550:	test: 0.7567511	best: 0.7567511 (550)	total: 12.3s	remaining: 10s
600:	test: 0.7574911	best: 0.7574911 (600)	total: 13.1s	remaining: 8.67s
650:	test: 0.7579889	best: 0.7579889 (650)	total: 13.9s	rema

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6386386	best: 0.6386386 (0)	total: 32ms	remaining: 31.9s
50:	test: 0.7407731	best: 0.7407731 (50)	total: 1.01s	remaining: 18.8s
100:	test: 0.7469451	best: 0.7469451 (100)	total: 1.89s	remaining: 16.8s
150:	test: 0.7506180	best: 0.7506180 (150)	total: 2.75s	remaining: 15.5s
200:	test: 0.7539460	best: 0.7539460 (200)	total: 3.58s	remaining: 14.2s
250:	test: 0.7562898	best: 0.7562898 (250)	total: 4.4s	remaining: 13.1s
300:	test: 0.7581371	best: 0.7581371 (300)	total: 5.2s	remaining: 12.1s
350:	test: 0.7596453	best: 0.7596456 (349)	total: 6.06s	remaining: 11.2s
400:	test: 0.7606209	best: 0.7606209 (400)	total: 6.9s	remaining: 10.3s
450:	test: 0.7616511	best: 0.7616511 (450)	total: 8.11s	remaining: 9.87s
500:	test: 0.7623832	best: 0.7623832 (500)	total: 10.5s	remaining: 10.4s
550:	test: 0.7629711	best: 0.7629711 (550)	total: 12.3s	remaining: 10s
600:	test: 0.7635171	best: 0.7635171 (600)	total: 13.1s	remaining: 8.69s
650:	test: 0.7640242	best: 0.7640242 (650)	total: 13.9s	remaini

### Скоры

In [14]:
'Train roc-auc', roc_auc_score(targets, train_preds)


('Train roc-auc', 0.7758897350547598)

In [15]:
'CV roc-auc', roc_auc_score(targets, oof)


('CV roc-auc', 0.7641613449172037)