In [65]:
import os
from typing import List, Optional
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
import re

In [66]:
# Create the project's internal working folder (no error if it already exists)
os.makedirs('pipeline', exist_ok=True)

# Lists of features for the functions

## Basic lists

In [67]:
# rn is a unique feature

# Binarized features
pre_features = [
    'pre_since_opened', 'pre_since_confirmed',
    'pre_pterm', 'pre_fterm',
    'pre_till_pclose', 'pre_till_fclose',
    'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
    'pre_loans_outstanding', 'pre_loans_max_overdue_sum',
    'pre_loans_credit_cost_rate',
    'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90',
    'pre_util', 'pre_over2limit', 'pre_maxover2limit',
]

# Encoded features
enc_features = [
    f'enc_loans_{suffix}'
    for suffix in ('account_holder_type', 'credit_status', 'credit_type', 'account_cur')
]

# Monthly payment statuses: enc_paym_0 ... enc_paym_24
enc_paym_features = [f'enc_paym_{month}' for month in range(25)]

# Flags
flag_features = [
    f'is_zero_{name}'
    for name in (
        'loans5', 'loans530', 'loans3060', 'loans6090', 'loans90',
        'util', 'over2limit', 'maxover2limit',
    )
] + ['pclose_flag', 'fclose_flag']

In [68]:
# Load the source (per-row) training table and show its shape.
# NOTE(review): in the cells visible here only the column names of this frame are used;
# if nothing further down needs the rows, reading just the header would be much cheaper — confirm.
df_source = pd.read_csv('prepared_data/source_data_train_1.csv')
df_source.shape

(20931476, 61)

In [69]:
# Load the final prepared training table (the target feature set) and show its shape
df_result = pd.read_csv('prepared_data/cut_corr_imp_train.csv')
df_result.shape

(2400000, 61)

In [70]:
# Column names of the source table; preview the first ten
df_source_columns = df_source.columns.tolist()
df_source_columns[:10]

['id',
 'rn',
 'pre_since_opened',
 'pre_since_confirmed',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_next_pay_summ']

In [71]:
# Column names of the final table; preview the first ten
df_result_columns = df_result.columns.tolist()
df_result_columns[:10]

['id',
 'flag',
 'is_zero_sum_prop_1',
 'enc_paym_avg_0_1_this_year_diff',
 'pre_util_prop_3',
 'enc_loans_credit_type_prop_0',
 'pre_till_pclose_prop_10',
 'pre_util_prop_6',
 'pre_loans_outstanding_prop_1',
 'pre_util_mean_freq']

## List of features to download from the original dataset

In [72]:
# Build the list of columns from df_source_columns that do NOT occur
# as a substring of any column name in df_result_columns.
# NOTE(review): plain substring matching also keeps a short name that is merely
# contained in a longer one (e.g. 'enc_paym_1' is contained in
# 'enc_paym_10_mean_freq'); confirm this is intended before tightening the rule,
# because later cells build needed_columns from this list.
drop_list = [
    col_source
    for col_source in df_source_columns
    if not any(col_source in col_result for col_result in df_result_columns)
]

print(len(drop_list))
drop_list

30


['pre_loans_total_overdue',
 'pre_loans3060',
 'pre_loans6090',
 'pre_loans90',
 'is_zero_loans3060',
 'is_zero_loans6090',
 'is_zero_loans90',
 'pre_maxover2limit',
 'is_zero_util',
 'is_zero_maxover2limit',
 'enc_paym_3',
 'enc_paym_4',
 'enc_paym_5',
 'enc_paym_6',
 'enc_paym_7',
 'enc_paym_11',
 'enc_paym_12',
 'enc_paym_13',
 'enc_paym_14',
 'enc_paym_15',
 'enc_paym_16',
 'enc_paym_17',
 'enc_paym_18',
 'enc_paym_19',
 'enc_paym_20',
 'enc_paym_21',
 'enc_paym_22',
 'enc_paym_23',
 'pclose_flag',
 'fclose_flag']

In [73]:
# Source columns that survived the drop list, in their original order
needed_columns = [
    column
    for column in df_source_columns
    if column not in drop_list
]

print(len(needed_columns))
needed_columns[:10]

31


['id',
 'rn',
 'pre_since_opened',
 'pre_since_confirmed',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_next_pay_summ']

In [74]:
# Add the features from the flag_features and enc_paym_features groups that are
# still missing, so that the functions processing those groups work correctly.
features_list = (
    [f'is_zero_loans{bucket}' for bucket in ('3060', '6090', '90')]
    + [f'enc_paym_{month}' for month in range(3, 8)]
    + [f'enc_paym_{month}' for month in range(11, 24)]
)

# List of columns to read from the source dataset.
# Only names not already present are appended, so re-running this cell
# does not duplicate columns.
needed_columns = needed_columns + [
    column for column in features_list if column not in needed_columns
]

print(len(needed_columns))
needed_columns[:10]

52


['id',
 'rn',
 'pre_since_opened',
 'pre_since_confirmed',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_next_pay_summ']

## Create_definite_value_proportion_features_pipeline function list

In [75]:
# Proportion features present in the final dataset (names containing 'prop_')
prop_features_result_list = [
    name
    for name in df_result_columns
    if 'prop_' in name
]

print(len(prop_features_result_list))
prop_features_result_list

38


['is_zero_sum_prop_1',
 'pre_util_prop_3',
 'enc_loans_credit_type_prop_0',
 'pre_till_pclose_prop_10',
 'pre_util_prop_6',
 'pre_loans_outstanding_prop_1',
 'pre_loans_credit_limit_prop_2',
 'pre_loans_credit_cost_rate_prop_6',
 'pre_loans_outstanding_prop_5',
 'pre_loans_credit_cost_rate_prop_11',
 'pre_loans_credit_cost_rate_prop_4',
 'pre_loans_next_pay_summ_prop_5',
 'pre_since_opened_prop_12',
 'pre_loans_credit_limit_prop_15',
 'enc_loans_credit_type_prop_2',
 'pre_fterm_prop_7',
 'enc_paym_0_prop_1',
 'is_zero_over2limit_prop_1',
 'pre_since_opened_prop_8',
 'pre_loans_max_overdue_sum_prop_1',
 'pre_loans_next_pay_summ_prop_0',
 'pre_pterm_prop_6',
 'pre_since_opened_prop_19',
 'is_zero_loans5_prop_1',
 'enc_loans_account_holder_type_prop_4',
 'pre_loans_credit_limit_prop_18',
 'pre_till_fclose_prop_4',
 'pre_pterm_prop_3',
 'is_zero_loans530_prop_1',
 'enc_loans_credit_status_prop_5',
 'pre_since_confirmed_prop_4',
 'pre_fterm_prop_3',
 'pre_till_fclose_prop_3',
 'pre_till_fcl

In [76]:
# Source-dataset features the proportion features were derived from:
# strip everything from '_prop' onward and de-duplicate.
# The order of the resulting list follows set iteration order.
prop_features_source_list = list(
    {re.sub(r'_prop.*$', '', col) for col in prop_features_result_list}
)

print(len(prop_features_source_list))
prop_features_source_list

22


['pre_till_pclose',
 'is_zero_loans5',
 'pre_since_opened',
 'enc_loans_credit_type',
 'pre_since_confirmed',
 'pre_fterm',
 'pre_till_fclose',
 'enc_paym_24',
 'enc_paym_0',
 'enc_loans_credit_status',
 'pre_loans_max_overdue_sum',
 'enc_loans_account_holder_type',
 'pre_loans_outstanding',
 'pre_loans_credit_cost_rate',
 'is_zero_over2limit',
 'pre_util',
 'pre_loans_next_pay_summ',
 'is_zero_sum',
 'pre_loans_credit_limit',
 'pre_over2limit',
 'is_zero_loans530',
 'pre_pterm']

In [77]:
# Assemble part of the proportion-features dictionary for the pipeline:
# for every source feature, collect the trailing numbers of the result columns
# whose name starts with that feature followed by an underscore (or end of string).
prop_features_dict = {
    source_col: [
        # The number at the very end of the column name is the tracked value
        int(re.search(r'(\d+)$', result_col).group(1))
        for result_col in prop_features_result_list
        if re.match(r'^' + source_col + r'(_|$)', result_col)
    ]
    for source_col in prop_features_source_list
}
prop_features_dict

{'pre_till_pclose': [10, 7],
 'is_zero_loans5': [1],
 'pre_since_opened': [12, 8, 19],
 'enc_loans_credit_type': [0, 2],
 'pre_since_confirmed': [4, 7],
 'pre_fterm': [7, 3],
 'pre_till_fclose': [4, 3, 1],
 'enc_paym_24': [1],
 'enc_paym_0': [1],
 'enc_loans_credit_status': [5],
 'pre_loans_max_overdue_sum': [1],
 'enc_loans_account_holder_type': [4],
 'pre_loans_outstanding': [1, 5],
 'pre_loans_credit_cost_rate': [6, 11, 4],
 'is_zero_over2limit': [1],
 'pre_util': [3, 6],
 'pre_loans_next_pay_summ': [5, 0],
 'is_zero_sum': [1],
 'pre_loans_credit_limit': [2, 15, 18],
 'pre_over2limit': [17],
 'is_zero_loans530': [1],
 'pre_pterm': [6, 3]}

In [78]:
# Add the missing is_zero_loans* entries needed by the summing function.
# Remove is_zero_sum: the is_zero_sum_prop_1 feature is built by another function.
is_zero_loans_list = [
    'is_zero_loans5',
    'is_zero_loans530',
    'is_zero_loans3060',
    'is_zero_loans6090',
    'is_zero_loans90'
]
for col in is_zero_loans_list:
    if col not in prop_features_dict:
        prop_features_dict[col] = [1]

# pop() with a default keeps the cell re-runnable (a plain `del` raises KeyError
# on the second run, once the key is already gone).
prop_features_dict.pop('is_zero_sum', None)

prop_features_dict

{'pre_till_pclose': [10, 7],
 'is_zero_loans5': [1],
 'pre_since_opened': [12, 8, 19],
 'enc_loans_credit_type': [0, 2],
 'pre_since_confirmed': [4, 7],
 'pre_fterm': [7, 3],
 'pre_till_fclose': [4, 3, 1],
 'enc_paym_24': [1],
 'enc_paym_0': [1],
 'enc_loans_credit_status': [5],
 'pre_loans_max_overdue_sum': [1],
 'enc_loans_account_holder_type': [4],
 'pre_loans_outstanding': [1, 5],
 'pre_loans_credit_cost_rate': [6, 11, 4],
 'is_zero_over2limit': [1],
 'pre_util': [3, 6],
 'pre_loans_next_pay_summ': [5, 0],
 'pre_loans_credit_limit': [2, 15, 18],
 'pre_over2limit': [17],
 'is_zero_loans530': [1],
 'pre_pterm': [6, 3],
 'is_zero_loans3060': [1],
 'is_zero_loans6090': [1],
 'is_zero_loans90': [1]}

## List for create_mean_value_frequency_feature_pipeline features

In [79]:
# All mean-frequency features present in the final dataset
mean_freq_result_list = [
    name
    for name in df_result_columns
    if 'mean_freq' in name
]

print(len(mean_freq_result_list))
mean_freq_result_list

16


['pre_util_mean_freq',
 'pre_loans_credit_limit_mean_freq',
 'pre_since_opened_mean_freq',
 'pre_loans_credit_cost_rate_mean_freq',
 'enc_loans_credit_type_mean_freq',
 'pre_loans_next_pay_summ_mean_freq',
 'pre_since_confirmed_mean_freq',
 'pre_pterm_mean_freq',
 'enc_paym_0_mean_freq',
 'enc_loans_account_holder_type_mean_freq',
 'pre_loans530_mean_freq',
 'enc_paym_8_mean_freq',
 'pre_loans5_mean_freq',
 'enc_paym_10_mean_freq',
 'enc_loans_account_cur_mean_freq',
 'enc_paym_9_mean_freq']

In [80]:
# Source-dataset features the mean-frequency features were derived from:
# cut the trailing '_mean_freq' off every result column name.
mean_freq_source_list = [
    name[:len(name) - len('_mean_freq')]
    for name in mean_freq_result_list
]
print(len(mean_freq_source_list))
mean_freq_source_list

16


['pre_util',
 'pre_loans_credit_limit',
 'pre_since_opened',
 'pre_loans_credit_cost_rate',
 'enc_loans_credit_type',
 'pre_loans_next_pay_summ',
 'pre_since_confirmed',
 'pre_pterm',
 'enc_paym_0',
 'enc_loans_account_holder_type',
 'pre_loans530',
 'enc_paym_8',
 'pre_loans5',
 'enc_paym_10',
 'enc_loans_account_cur',
 'enc_paym_9']

## Drop list

In [81]:
# Intermediate features listed for removal together with the source columns
temporary_features_list = [
    f'enc_paym_avg_{tag}'
    for tag in ('1_all', '2_all', '0_this_year', '1_this_year', '0_last_year')
] + [
    f'is_zero_loans{bucket}_prop_1'
    for bucket in ('3060', '6090', '90')
]

In [82]:
# Rebinds drop_list (previously the list of unused source columns):
# from here on it is the source columns read from disk plus the temporary features.
drop_list = [*needed_columns, *temporary_features_list]
print(len(drop_list))
drop_list

60


['id',
 'rn',
 'pre_since_opened',
 'pre_since_confirmed',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_next_pay_summ',
 'pre_loans_outstanding',
 'pre_loans_max_overdue_sum',
 'pre_loans_credit_cost_rate',
 'pre_loans5',
 'pre_loans530',
 'is_zero_loans5',
 'is_zero_loans530',
 'pre_util',
 'pre_over2limit',
 'is_zero_over2limit',
 'enc_paym_0',
 'enc_paym_1',
 'enc_paym_2',
 'enc_paym_8',
 'enc_paym_9',
 'enc_paym_10',
 'enc_paym_24',
 'enc_loans_account_holder_type',
 'enc_loans_credit_status',
 'enc_loans_credit_type',
 'enc_loans_account_cur',
 'is_zero_loans3060',
 'is_zero_loans6090',
 'is_zero_loans90',
 'enc_paym_3',
 'enc_paym_4',
 'enc_paym_5',
 'enc_paym_6',
 'enc_paym_7',
 'enc_paym_11',
 'enc_paym_12',
 'enc_paym_13',
 'enc_paym_14',
 'enc_paym_15',
 'enc_paym_16',
 'enc_paym_17',
 'enc_paym_18',
 'enc_paym_19',
 'enc_paym_20',
 'enc_paym_21',
 'enc_paym_22',
 'enc_paym_23',
 'enc_paym_avg_1_all',
 'enc_paym_av

# Downloading dataset and target

In [83]:
# СКАЧИВАЕМ ИСХОДНЫЙ ДАТАСЕТ

def read_parquet_dataset_from_local(
    path_to_dataset: str,
    start_from: int = 0,
    num_parts_to_read: int = 2,
    columns: Optional[List[str]] = None,
    verbose: bool = False
) -> pd.DataFrame:
    """
    Read `num_parts_to_read` partitions, convert them to a pd.DataFrame and return it.

    Only files in `path_to_dataset` whose name starts with 'train' are considered;
    they are ordered by sorting their paths as strings.

    Args:
        path_to_dataset : path to the directory with the partitions
        start_from : index of the first partition to read (negative values are treated as 0)
        num_parts_to_read : number of partitions to read
        columns : list of columns to read from each partition
        verbose : whether to print additional information

    Returns:
        pd.DataFrame : the concatenated partitions with a fresh 0..n-1 index;
        an empty DataFrame (with `columns`, if given) when no partition
        falls into the requested range.
    """
    # String sort: partition numbers in file names must be zero-padded
    # for this to match numeric order.
    dataset_paths = sorted(
        os.path.join(path_to_dataset, filename)
        for filename in os.listdir(path_to_dataset)
        if filename.startswith('train')
    )

    if verbose:
        print('Dataset paths:')
        for path in dataset_paths:
            print(path)

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]

    if verbose:
        print('Reading chunks:')
        for chunk in chunks:
            print(chunk)

    frames = []
    for chunk_path in tqdm(chunks, desc="Reading dataset with pandas"):
        if verbose:
            print('Reading chunk:', chunk_path)
        frames.append(pd.read_parquet(chunk_path, columns=columns))

    # pd.concat raises ValueError on an empty list; return an empty frame instead
    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames).reset_index(drop=True)

def prepare_transactions_dataset(
    path_to_dataset: str,
    num_parts_to_preprocess_at_once: int = 1,
    num_parts_total: int = 50,
    save_to_path: Optional[str] = None,
    verbose: bool = False,
    columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Return the source pd.DataFrame with the features from which the
    training dataset is to be assembled.

    Args:
        path_to_dataset : path to the partitioned dataset
        num_parts_to_preprocess_at_once : number of partitions that are
            held and processed in memory at the same time
        num_parts_total : total number of partitions to process
        save_to_path : directory where each processed block is saved in .parquet format
            (created if missing); if None, nothing is saved
        verbose : log every processed part of the data
        columns : list of columns to keep

    Returns:
        pd.DataFrame : the blocks concatenated in reading order. Each block keeps
        its own 0..n-1 index, so index labels repeat across blocks.
    """
    preprocessed_frames = []

    if save_to_path:
        # to_parquet does not create missing directories
        os.makedirs(save_to_path, exist_ok=True)

    for step in tqdm(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                     desc="Transforming transactions data"):
        transactions_frame = read_parquet_dataset_from_local(
            path_to_dataset,
            start_from=step,
            num_parts_to_read=num_parts_to_preprocess_at_once,
            verbose=verbose,
            columns=columns
        )

        # Write the prepared block to a file
        if save_to_path:
            # Three-digit, zero-padded block number (000, 001, ..., 010, ..., 100)
            block_as_str = str(step).zfill(3)
            transactions_frame.to_parquet(
                os.path.join(save_to_path, f'processed_chunk_{block_as_str}.parquet')
            )

        preprocessed_frames.append(transactions_frame)

    # Nothing was requested (num_parts_total <= 0): pd.concat would raise on an empty list
    if not preprocessed_frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(preprocessed_frames)

In [84]:
"""
Собираем исходный датасет из parquet файлов,  
скачиваем только необходимые колонки
"""
# Assemble the source dataset from the parquet files, reading only the needed columns.
# Path to the data inside the project
path = 'train_data/'

# NOTE(review): save_to_path points at the same directory the partitions are read from
data = prepare_transactions_dataset(
    path,
    num_parts_to_preprocess_at_once=1,
    num_parts_total=12,
    save_to_path='train_data/',
    columns=needed_columns) 

# Load the dataset with the target variable
target = pd.read_csv('train_target.csv')

# Split the target dataset into train/test parts (stratified by the 'flag' column)
y_train, y_test  = train_test_split(target, train_size=0.8, random_state=0, stratify=target.flag)

# Take the sets of ids from train/test
train_id = y_train['id'].values
test_id = y_test['id'].values

# Split the source dataset into train/test parts by those id sets.
# X_* end up with more rows than y_* (see the shapes printed below), i.e. several rows per id.
X_train = data.set_index('id').loc[train_id].reset_index()
X_test = data.set_index('id').loc[test_id].reset_index()

# Reset the indices to bring them in line with X_train/X_test
y_train = y_train.reset_index(drop=True)['flag']
y_test = y_test.reset_index(drop=True)['flag']

X_train.shape, X_test.shape, y_train.shape, y_test.shape

Transforming transactions data:   0%|          | 0/12 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

((20931476, 52), (5231241, 52), (2400000,), (600000,))

In [144]:
# Save the split data.
# y_train / y_test are Series here; when re-read with pd.read_csv they come back
# as single-column DataFrames (compare the shapes printed by this and the next cell).
X_train.to_csv('pipeline/X_train.csv', index=False)
X_test.to_csv('pipeline/X_test.csv', index=False)
y_train.to_csv('pipeline/y_train.csv', index=False)
y_test.to_csv('pipeline/y_test.csv', index=False)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20931476, 52), (5231241, 52), (2400000,), (600000,))

# Pipeline

In [85]:
# Load the previously saved split source data
X_train = pd.read_csv('pipeline/X_train.csv')
X_test = pd.read_csv('pipeline/X_test.csv')
y_train = pd.read_csv('pipeline/y_train.csv')
y_test = pd.read_csv('pipeline/y_test.csv')
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20931476, 52), (5231241, 52), (2400000, 1), (600000, 1))

In [86]:
"""
Для полной проверки будем использовать тестовые данные 
данные как меньшие по объёму.
"""
# The full check is run on the test data because it is the smaller split;
# work on a copy so X_test itself stays untouched.
X_test_full = X_test.copy()
X_test_full.shape

(5231241, 52)

In [87]:
# DATA PREPROCESSING

def convert_all_to_numeric(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Convert every column to a numeric dtype; values that cannot be
    parsed as numbers are replaced with NaN.

    Args:
        df: Source DataFrame.

    Returns:
        pandas.DataFrame: New DataFrame with the same columns, each passed
        through ``pd.to_numeric(..., errors='coerce')``.
    """
    # Extra keyword arguments of DataFrame.apply are forwarded to the applied function
    return df.apply(pd.to_numeric, errors='coerce')

In [88]:
# FEATURE ENGINEERING FUNCTIONS

def create_rn_max_feature_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Attach a new column 'rn_max' holding the maximum 'rn' value
    within each 'id' group.

    Args:
        df: Source DataFrame containing the columns 'id' and 'rn'.

    Returns:
        pandas.DataFrame: New DataFrame (the input is left untouched) with the
        additional column 'rn_max'; rows keep their order and get a fresh
        0..n-1 index from the merge.
    """
    print('FUNCTION create_rn_max_feature ')

    # Largest 'rn' per 'id', as a Series named after the new column
    rn_max_per_id = df.groupby('id')['rn'].max().rename('rn_max')

    # Left merge broadcasts the per-id value back to every row of that id
    # and already returns a new frame.
    return df.merge(rn_max_per_id, on='id', how='left')

def enc_paym_transcoding_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Recode the enc_paym features to a common value range {0, 1, 2, 3}.

    For each column enc_paym_0, enc_paym_1, ..., enc_paym_24: if the value 4
    occurs in the column, its values are shifted down by the mapping
        1 -> 0
        2 -> 1
        3 -> 2
        4 -> 3
    Columns that do not contain a 4 are left as they are.

    NOTE(review): the decision is taken from the data passed in, so the same
    column may be recoded for one frame and not for another — confirm that
    this is acceptable for train vs. inference data.

    Args:
        df: Source DataFrame with the columns 'enc_paym_0' ... 'enc_paym_24'.

    Returns:
        pandas.DataFrame: Copy of the DataFrame with the recoded features.
    """
    print('FUNCTION enc_paym_transcoding ')
    df = df.copy()

    recode_map = {1: 0, 2: 1, 3: 2, 4: 3}

    for month in range(25):
        col = f'enc_paym_{month}'
        # Only columns in which the value 4 actually occurs are recoded
        if df[col].eq(4).any():
            df.loc[:, col] = df[col].replace(recode_map)

    return df

def create_definite_value_proportion_features_pipeline(
    df: pd.DataFrame,
    features_dictionary: Optional[dict] = None
) -> pd.DataFrame:
    """
    Create and add new frequency features based on given values
    of the source features.

    For every column and every value listed for it in the dictionary, a new
    feature '{column}_prop_{value}' is added: the number of rows of an id in
    which the column equals the value, divided by 'rn_max' of the row.

    Args:
        df: Source DataFrame containing 'id', 'rn_max' and the features named
            in the dictionary.
        features_dictionary: Optional mapping {source column: [values]}.
            When omitted, the built-in dictionary below is used.

    Returns:
        pandas.DataFrame: New DataFrame (the input is left untouched, the index
        is reset to 0..n-1) with the frequency features appended.
    """
    print('FUNCTION create_definite_value_proportion_features ')

    if features_dictionary is None:
        features_dictionary = {
            'enc_loans_account_holder_type': [4],
            'pre_pterm': [6, 3],
            'is_zero_loans530': [1],
            'enc_paym_0': [1],
            'pre_loans_credit_cost_rate': [6, 11, 4],
            'pre_loans_next_pay_summ': [5, 0],
            'is_zero_over2limit': [1],
            'pre_loans_outstanding': [1, 5],
            'pre_util': [3, 6],
            'pre_till_pclose': [10, 7],
            'is_zero_loans5': [1],
            'pre_since_confirmed': [4, 7],
            'pre_loans_credit_limit': [2, 15, 18],
            'pre_over2limit': [17],
            'pre_till_fclose': [4, 3, 1],
            'enc_loans_credit_status': [5],
            'pre_since_opened': [12, 8, 19],
            'enc_paym_24': [1],
            'pre_loans_max_overdue_sum': [1],
            'enc_loans_credit_type': [0, 2],
            'pre_fterm': [7, 3],
            'is_zero_loans3060': [1],
            'is_zero_loans6090': [1],
            'is_zero_loans90': [1]
        }

    # New frame with a default 0..n-1 index, as the previous merge-based
    # implementation returned.
    df = df.reset_index(drop=True)

    for col, values in features_dictionary.items():
        print('Исходный признак', col)
        print('Новые фичи')

        for value in values:
            new_column = f'{col}_prop_{value}'
            print(new_column)

            # Number of rows per id in which `col` equals `value`
            count_per_id = df.loc[df[col] == value].groupby('id').size()

            # Broadcast the per-id count back to the rows by id lookup (ids without
            # such rows get 0), then relate it to the number of loans.
            # A lookup replaces one full-frame merge per new feature.
            df[new_column] = df['id'].map(count_per_id).fillna(0) / df['rn_max']

    return df

def from_is_zero_prop_1_create_sum_prop_1_feature_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Add the feature 'is_zero_sum_prop_1': the row-wise sum of the five
    is_zero_loans*_prop_1 features divided by 5.

    Args:
        df: Source DataFrame with the is_zero_loans*_prop_1 features.

    Returns:
        pandas.DataFrame: Copy of the DataFrame with the added feature 'is_zero_sum_prop_1'.
    """

    print('FUNCTION of_is_zero_prop_1_create_sum_prop_1_feature ')

    overdue_buckets = ('5', '530', '3060', '6090', '90')
    source_columns = [f'is_zero_loans{bucket}_prop_1' for bucket in overdue_buckets]

    # assign() returns a new frame, so the caller's DataFrame is not modified
    return df.assign(
        is_zero_sum_prop_1=df[source_columns].sum(axis=1) / 5
    )

def create_mean_value_frequency_feature_pipeline(
    df: pd.DataFrame,
    columns_list: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Create aggregated features reflecting the mean frequency (relative
    occurrence) of the values of the given columns for every id.

    For each column: the relative frequency of every value is computed over the
    frame passed in, each row is replaced by the frequency of its value, the
    frequencies are summed per id and the sum is divided by 'rn_max' of the row.

    Args:
        df : Source DataFrame with 'id', 'rn_max' and the columns to process.
        columns_list : Optional list of columns to process. When omitted,
            the built-in list below is used.

    Returns:
        pandas.DataFrame : New DataFrame (the input is left untouched, the index
        is reset to 0..n-1) with one added column '{column}_mean_freq' per
        processed column.
    """
    print('FUNCTION create_mean_value_frequency_feature ')

    if columns_list is None:
        # Columns for which the mean value frequency is computed
        columns_list = [
            'pre_util',
            'pre_loans_credit_limit',
            'pre_since_opened',
            'pre_loans_credit_cost_rate',
            'enc_loans_credit_type',
            'pre_loans_next_pay_summ',
            'pre_since_confirmed',
            'pre_pterm',
            'enc_paym_0',
            'enc_loans_account_holder_type',
            'pre_loans530',
            'enc_paym_8',
            'pre_loans5',
            'enc_paym_10',
            'enc_loans_account_cur',
            'enc_paym_9'
        ]

    # New frame with a default 0..n-1 index, as the previous merge-based
    # implementation returned.
    df = df.reset_index(drop=True)

    for col in columns_list:
        new_column = f'{col}_mean_freq'
        print('new_column', new_column)

        # Relative frequency of every distinct value of the column
        bin_freq = df[col].value_counts(normalize=True).to_dict()

        # Frequency of the value found in each row
        freq_series = df[col].map(bin_freq)

        # Sum of the row frequencies per id
        freq_sum_per_id = freq_series.groupby(df['id']).sum()

        # Broadcast the per-id sum back to the rows by id lookup (replaces one
        # full-frame merge per feature) and normalise by the number of records.
        df[new_column] = df['id'].map(freq_sum_per_id) / df['rn_max']

    return df

def _enc_paym_status_share(
    df: pd.DataFrame,
    status: int,
    columns: List[str]
) -> pd.Series:
    """Per id, count the cells of `columns` equal to `status`; return that count divided by 'rn_max' for every row."""
    # Number of matching cells in each row
    hits_per_row = (df[columns] == status).sum(axis=1)
    # Total number of matching cells per id
    hits_per_id = hits_per_row.groupby(df['id']).sum()
    # Broadcast back to the rows by id lookup and normalise by the number of records
    return df['id'].map(hits_per_id) / df['rn_max']


def enc_paym_norm_group_sum_diff_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Generate features that are differences between the average counts of
    payment statuses over different time spans.

    Final features created:
        - 'enc_paym_avg_0_1_this_year_diff'
        - 'enc_paym_avg_1_2_all_diff'
        - 'enc_paym_avg_0_years_diff'

    To compute them, intermediate aggregated features are added first
    ('enc_paym_avg_1_all', 'enc_paym_avg_2_all', 'enc_paym_avg_0_this_year',
    'enc_paym_avg_1_this_year', 'enc_paym_avg_0_last_year'): per id, the number
    of cells with the given status in the columns of the time span, divided by
    'rn_max'. This function does not drop the intermediate features; they stay
    in the returned frame.

    Time spans (by column number):
        - 'all'       : enc_paym_0 ... enc_paym_24
        - 'this_year' : enc_paym_0 ... enc_paym_11
        - 'last_year' : enc_paym_12 ... enc_paym_24

    Args:
        df : Source DataFrame with 'id', 'rn_max' and 'enc_paym_0' ... 'enc_paym_24'.

    Returns:
        pandas.DataFrame : New DataFrame (the input is left untouched, the index
        is reset to 0..n-1) with the intermediate and the final difference
        features appended.
    """

    print('FUNCTION enc_paym_norm_group_sum_diff_pipeline ')

    # New frame with a default 0..n-1 index, as the previous merge-based
    # implementation returned.
    df = df.reset_index(drop=True)

    # (time span name, column numbers of the span, payment statuses to count)
    spans = (
        ('all', range(25), (1, 2)),
        ('this_year', range(12), (0, 1)),
        ('last_year', range(12, 25), (0,)),
    )

    for time_span, months, statuses in spans:
        columns = [f'enc_paym_{month}' for month in months]
        for status in statuses:
            new_col = f'enc_paym_avg_{status}_{time_span}'
            print('new_column', new_col)
            df[new_col] = _enc_paym_status_share(df, status, columns)

    # Create the difference features
    df['enc_paym_avg_0_1_this_year_diff'] = (
        df['enc_paym_avg_0_this_year'] -
        df['enc_paym_avg_1_this_year']
    )

    df['enc_paym_avg_1_2_all_diff'] = (
        df['enc_paym_avg_1_all'] -
        df['enc_paym_avg_2_all']
    )

    df['enc_paym_avg_0_years_diff'] = (
        df['enc_paym_avg_0_this_year'] -
        df['enc_paym_avg_0_last_year']
    )

    print('new diff columns')
    print('enc_paym_avg_0_1_this_year_diff')
    print('enc_paym_avg_1_2_all_diff')
    print('enc_paym_avg_0_years_diff')

    return df

def pre_since_opened_sum_mean_repeated_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Add the per-'id' share of repeated 'pre_since_opened' values.

    How it works:
    - Counts how many times each 'pre_since_opened' value occurs within
      every 'id'.
    - Keeps only the values that occur more than once and subtracts 1 from
      their counts, so the first occurrence is not treated as a repeat.
    - Sums the remaining counts per 'id'; an 'id' without any repeated value
      gets 0.
    - Merges the per-'id' total back onto the rows and divides it by
      'rn_max', storing the result as 'pre_since_opened_repeated_prop'.

    Args:
        df : DataFrame containing the columns 'pre_since_opened', 'id'
            and 'rn_max'.

    Returns:
        pandas.DataFrame : A new DataFrame (the input is not modified) with
        the added column 'pre_since_opened_repeated_prop'.
    """

    print('FUNCTION from_pre_since_opened_create_pre_since_opened_sum_mean_repeated ')

    feature_name = 'pre_since_opened_repeated_prop'
    result = df.copy()

    # Number of rows for every (id, pre_since_opened) pair
    pair_sizes = result.groupby(['id', 'pre_since_opened']).size()

    # Only pairs seen more than once matter; the first occurrence is not a repeat
    extra_occurrences = pair_sizes.loc[pair_sizes > 1].sub(1)

    # Total repeats per id; ids with no repeated value are filled in with 0
    repeats_per_id = (
        extra_occurrences
        .groupby('id')
        .sum()
        .reindex(result['id'].unique(), fill_value=0)
        .rename(feature_name)
    )

    # Attach the per-id total to every row of that id
    result = result.merge(repeats_per_id, on='id', how='left')

    # Normalize the repeat total by the per-id record count 'rn_max'
    result[feature_name] = result[feature_name].div(result['rn_max'])

    return result

def drop_columns_drop_duplicates_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Strip source/temporary columns and collapse the frame to one row per 'id'.

    The listed columns are dropped first, then only the first row of every
    'id' is kept (with the index reset), and finally the 'id' column itself
    is dropped.

    Args:
        df : Input DataFrame; it must contain 'id' and every column from the
            drop list.

    Returns:
        pd.DataFrame : A new DataFrame without the listed columns, without
        duplicate 'id' rows and without the 'id' column.

    Raises:
        KeyError : If any column from the drop list (or 'id') is missing.
    """

    print('FUNCTION drop_columns_drop_duplicates ')

    # Columns to remove, grouped by feature family
    columns_to_drop = [
        # Record number inside an id
        'rn',
        # Binned source features
        'pre_since_opened',
        'pre_since_confirmed',
        'pre_pterm',
        'pre_fterm',
        'pre_till_pclose',
        'pre_till_fclose',
        'pre_loans_credit_limit',
        'pre_loans_next_pay_summ',
        'pre_loans_outstanding',
        'pre_loans_max_overdue_sum',
        'pre_loans_credit_cost_rate',
        'pre_loans5',
        'pre_loans530',
        'pre_util',
        'pre_over2limit',
        # Source flags
        'is_zero_loans5',
        'is_zero_loans530',
        'is_zero_loans3060',
        'is_zero_loans6090',
        'is_zero_loans90',
        'is_zero_over2limit',
        # Monthly payment statuses enc_paym_0 .. enc_paym_24
        *(f'enc_paym_{month}' for month in range(25)),
        # Encoded loan attributes
        'enc_loans_account_holder_type',
        'enc_loans_credit_status',
        'enc_loans_credit_type',
        'enc_loans_account_cur',
        # Intermediate aggregates that only feed the diff features
        'enc_paym_avg_1_all',
        'enc_paym_avg_2_all',
        'enc_paym_avg_0_this_year',
        'enc_paym_avg_1_this_year',
        'enc_paym_avg_0_last_year',
        # Intermediate proportions
        'is_zero_loans3060_prop_1',
        'is_zero_loans6090_prop_1',
        'is_zero_loans90_prop_1',
    ]

    # Every step below returns a new frame, so the caller's DataFrame stays intact
    return (
        df
        .drop(columns=columns_to_drop)
        # Keep the first row of each id and renumber the rows
        .drop_duplicates(subset=['id'], keep='first')
        .reset_index(drop=True)
        # 'id' is no longer needed once the rows are unique
        .drop(columns='id')
    )

In [89]:
# Median imputer configured to return a pandas DataFrame, because the steps
# that follow it call DataFrame methods on its output.
imputer = SimpleImputer(strategy='median')
imputer.set_output(transform='pandas')


def _cast_all_to_int(df: pd.DataFrame) -> pd.DataFrame:
    """Cast every column of the DataFrame to int."""
    return df.astype(int)


def _drop_duplicate_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Remove fully duplicated rows from the DataFrame."""
    return df.drop_duplicates()


# Preprocessing pipeline: numeric conversion -> median imputation ->
# integer cast -> removal of duplicated rows.
# Named module-level functions are used instead of lambdas: a
# FunctionTransformer that wraps a lambda cannot be serialized with the
# standard pickle module, which would make the whole pipeline unsavable.
preprocessing_pipe = Pipeline([
    ('to_numeric', FunctionTransformer(convert_all_to_numeric)),
    ('imputer', imputer),
    ('to_int', FunctionTransformer(_cast_all_to_int, validate=False)),
    ('drop_duplicates', FunctionTransformer(_drop_duplicate_rows, validate=False))
])

# Main pipeline: preprocessing followed by the feature-engineering steps in
# the order they must run; the final step drops source/temporary columns and
# collapses the data to one row per 'id'.
main_pipe = Pipeline(
    [
        (
            'preprocessing',
            preprocessing_pipe
        ),
        (
            'create_rn_max_feature',
            FunctionTransformer(create_rn_max_feature_pipeline)
        ),
        (
            'enc_paym_transcoding',
            FunctionTransformer(enc_paym_transcoding_pipeline)
        ),
        (
            'create_definite_value_proportion_features',
            FunctionTransformer(create_definite_value_proportion_features_pipeline)
        ),
        (
            'create_sum_prop_1_feature',
            FunctionTransformer(from_is_zero_prop_1_create_sum_prop_1_feature_pipeline)
        ),
        (
            'create_mean_value_frequency_feature',
            FunctionTransformer(create_mean_value_frequency_feature_pipeline)
        ),
        (
            'from_enc_paym_create_normalized_group_sum_features_then_diff_features',
            FunctionTransformer(enc_paym_norm_group_sum_diff_pipeline)
        ),
        (
            'from_pre_since_opened_create_pre_since_opened_sum_mean_repeated',
            FunctionTransformer(pre_since_opened_sum_mean_repeated_pipeline)
        ),
        (
            'drop_temporary_and_source_columns_drop_duplicates',
            FunctionTransformer(drop_columns_drop_duplicates_pipeline)
        ),
    ]
)

In [90]:
# Transform the entire test dataset.
# NOTE(review): fit_transform also fits the pipeline on this data, so the
# imputer's medians are learned from the test set itself — confirm that this
# is intended. The result is assigned back to X_test_full, so the
# untransformed frame is no longer available under that name after this cell
# runs, and running the cell a second time feeds the transformed data back in.
X_test_full = main_pipe.fit_transform(X_test_full)
X_test_full

FUNCTION create_rn_max_feature 
FUNCTION enc_paym_transcoding 
FUNCTION create_definite_value_proportion_features 
Исходный признак enc_loans_account_holder_type
Новые фичи
enc_loans_account_holder_type_prop_4
Исходный признак pre_pterm
Новые фичи
pre_pterm_prop_6
pre_pterm_prop_3
Исходный признак is_zero_loans530
Новые фичи
is_zero_loans530_prop_1
Исходный признак enc_paym_0
Новые фичи
enc_paym_0_prop_1
Исходный признак pre_loans_credit_cost_rate
Новые фичи
pre_loans_credit_cost_rate_prop_6
pre_loans_credit_cost_rate_prop_11
pre_loans_credit_cost_rate_prop_4
Исходный признак pre_loans_next_pay_summ
Новые фичи
pre_loans_next_pay_summ_prop_5
pre_loans_next_pay_summ_prop_0
Исходный признак is_zero_over2limit
Новые фичи
is_zero_over2limit_prop_1
Исходный признак pre_loans_outstanding
Новые фичи
pre_loans_outstanding_prop_1
pre_loans_outstanding_prop_5
Исходный признак pre_util
Новые фичи
pre_util_prop_3
pre_util_prop_6
Исходный признак pre_till_pclose
Новые фичи
pre_till_pclose_prop_10
pr

Unnamed: 0,rn_max,enc_loans_account_holder_type_prop_4,pre_pterm_prop_6,pre_pterm_prop_3,is_zero_loans530_prop_1,enc_paym_0_prop_1,pre_loans_credit_cost_rate_prop_6,pre_loans_credit_cost_rate_prop_11,pre_loans_credit_cost_rate_prop_4,pre_loans_next_pay_summ_prop_5,pre_loans_next_pay_summ_prop_0,is_zero_over2limit_prop_1,pre_loans_outstanding_prop_1,pre_loans_outstanding_prop_5,pre_util_prop_3,pre_util_prop_6,pre_till_pclose_prop_10,pre_till_pclose_prop_7,is_zero_loans5_prop_1,pre_since_confirmed_prop_4,pre_since_confirmed_prop_7,pre_loans_credit_limit_prop_2,pre_loans_credit_limit_prop_15,pre_loans_credit_limit_prop_18,pre_over2limit_prop_17,pre_till_fclose_prop_4,pre_till_fclose_prop_3,pre_till_fclose_prop_1,enc_loans_credit_status_prop_5,pre_since_opened_prop_12,pre_since_opened_prop_8,pre_since_opened_prop_19,enc_paym_24_prop_1,pre_loans_max_overdue_sum_prop_1,enc_loans_credit_type_prop_0,enc_loans_credit_type_prop_2,pre_fterm_prop_7,pre_fterm_prop_3,is_zero_sum_prop_1,pre_util_mean_freq,pre_loans_credit_limit_mean_freq,pre_since_opened_mean_freq,pre_loans_credit_cost_rate_mean_freq,enc_loans_credit_type_mean_freq,pre_loans_next_pay_summ_mean_freq,pre_since_confirmed_mean_freq,pre_pterm_mean_freq,enc_paym_0_mean_freq,enc_loans_account_holder_type_mean_freq,pre_loans530_mean_freq,enc_paym_8_mean_freq,pre_loans5_mean_freq,enc_paym_10_mean_freq,enc_loans_account_cur_mean_freq,enc_paym_9_mean_freq,enc_paym_avg_0_1_this_year_diff,enc_paym_avg_1_2_all_diff,enc_paym_avg_0_years_diff,pre_since_opened_repeated_prop
0,10,0.0,0.200000,0.000000,1.000000,0.000000,0.000000,0.000000,0.600000,0.100000,0.100000,0.800000,0.100000,0.200000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.200000,0.200000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.200000,0.1,0.100000,0.000000,1.000000,0.373551,0.047791,0.052505,0.180491,0.322699,0.491939,0.064709,0.068872,0.922156,0.986421,0.976465,0.519372,0.994481,0.460134,0.997625,0.492167,11.000000,0.000000,3.000000,0.200000
1,8,0.0,0.000000,0.000000,0.875000,0.125000,0.000000,0.000000,0.125000,0.000000,0.000000,0.875000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.875000,0.250000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.125000,0.000000,0.925000,0.542754,0.051571,0.053100,0.175146,0.396390,0.679525,0.053202,0.071902,0.810980,0.986421,0.976465,0.358095,0.994481,0.484894,0.997625,0.483197,5.625000,1.000000,3.125000,0.250000
2,24,0.0,0.041667,0.000000,0.833333,0.000000,0.000000,0.041667,0.041667,0.000000,0.083333,0.916667,0.041667,0.041667,0.000000,0.041667,0.083333,0.000000,0.958333,0.000000,0.041667,0.041667,0.000000,0.041667,0.0,0.041667,0.125000,0.000000,0.000000,0.083333,0.041667,0.000000,0.0,0.000000,0.041667,0.0,0.000000,0.041667,0.958333,0.512353,0.056079,0.048681,0.135509,0.486299,0.496827,0.055511,0.063760,0.922156,0.986421,0.976465,0.497436,0.994481,0.484894,0.997625,0.484131,8.625000,0.250000,5.458333,0.416667
3,10,0.0,0.100000,0.100000,0.900000,0.000000,0.000000,0.000000,0.900000,0.000000,0.100000,1.000000,0.000000,0.300000,0.200000,0.000000,0.100000,0.000000,1.000000,0.300000,0.000000,0.000000,0.200000,0.000000,0.0,0.000000,0.100000,0.000000,0.000000,0.100000,0.000000,0.100000,0.0,0.000000,0.000000,0.3,0.200000,0.100000,0.980000,0.435629,0.051427,0.050433,0.265330,0.348531,0.429022,0.064044,0.086874,0.922156,0.986421,0.976465,0.481767,0.994481,0.484894,0.997625,0.483197,7.500000,0.100000,2.600000,0.100000
4,18,0.0,0.000000,0.166667,0.888889,0.055556,0.277778,0.000000,0.111111,0.000000,0.055556,0.944444,0.055556,0.000000,0.055556,0.000000,0.055556,0.055556,0.944444,0.000000,0.000000,0.000000,0.000000,0.277778,0.0,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000,0.055556,0.0,0.000000,0.000000,0.0,0.055556,0.277778,0.944444,0.598533,0.044108,0.047689,0.085878,0.418242,0.505347,0.077908,0.084442,0.872744,0.986421,0.976465,0.416355,0.994481,0.498649,0.997625,0.478213,4.833333,0.333333,4.388889,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,6,0.0,0.000000,0.000000,0.500000,0.166667,0.000000,0.166667,0.000000,0.166667,0.166667,0.833333,0.166667,0.000000,0.000000,0.000000,0.333333,0.000000,1.000000,0.000000,0.000000,0.166667,0.166667,0.000000,0.0,0.166667,0.000000,0.000000,0.166667,0.000000,0.000000,0.000000,0.0,0.166667,0.000000,0.0,0.166667,0.000000,0.866667,0.479132,0.062465,0.049799,0.045086,0.519829,0.471400,0.050808,0.051435,0.620631,0.986421,0.816428,0.414988,0.994481,0.424269,0.997625,0.483197,5.666667,1.333333,6.000000,0.166667
599996,3,0.0,0.000000,0.000000,0.666667,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.933333,0.243452,0.051348,0.048371,0.110287,0.564650,0.261021,0.064779,0.058170,0.922156,0.986421,0.976465,0.497436,0.994481,0.474577,0.997625,0.486934,8.000000,0.333333,2.000000,0.000000
599997,5,0.0,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.000000,1.000000,0.200000,0.000000,0.000000,0.000000,0.200000,0.000000,1.000000,0.000000,0.000000,0.000000,0.200000,0.000000,0.0,0.200000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.200000,0.000000,1.000000,0.435546,0.052783,0.050754,0.089049,0.346523,0.429290,0.076531,0.063155,0.922156,0.986421,0.976465,0.453562,0.994481,0.503464,0.997625,0.476469,5.600000,0.000000,3.000000,0.400000
599998,7,0.0,0.285714,0.000000,0.857143,0.000000,0.000000,0.142857,0.000000,0.285714,0.000000,1.000000,0.285714,0.000000,0.000000,0.000000,0.142857,0.000000,1.000000,0.285714,0.142857,0.142857,0.142857,0.000000,0.0,0.285714,0.142857,0.142857,0.000000,0.142857,0.000000,0.000000,0.0,0.000000,0.142857,0.0,0.000000,0.000000,0.971429,0.413255,0.055032,0.050102,0.184725,0.372852,0.411566,0.079097,0.073273,0.922156,0.846662,0.976465,0.475051,0.994481,0.489315,0.997625,0.481595,7.571429,0.000000,3.142857,0.285714


git commit -m 'Create pipeline functions and drop_list' -m 'Create pre_since_opened_sum_mean_repeated_pipeline function - Create  drop_columns_drop_duplicates_pipeline function - Create drop_list for drop_columns_drop_duplicates_pipeline function'