# Baseline


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from tqdm import tqdm, trange
from tqdm.notebook import tqdm
# Включаем tqdm для pandas, чтобы можно было запускать progress_apply() вместо простого apply()
tqdm.pandas() 
# Устанавливаем тему по умолчанию
sb_dark = sns.dark_palette('skyblue', 8, reverse=True) # teal
sns.set(palette=sb_dark)

# Загрузка данных

In [3]:
PATH = ''
PATH_DATASET = PATH + 'datasets/'

file_pars_smp_train = PATH_DATASET + "train/pars_smp_train.npy"
file_y_smp_train = PATH_DATASET + "train/y_smp_train.npy"

file_random_submit = PATH_DATASET + "test/random_submit.npy"
file_y_smp_test = PATH_DATASET + "test/y_smp_test.npy"

file_output_submit = PATH_DATASET + "test/output_submit.npy"

In [4]:
pars_smp_train = np.load(file_pars_smp_train)
y_smp_train = np.load(file_y_smp_train)
random_submit = np.load(file_random_submit)
y_smp_test = np.load(file_y_smp_test)

pars_smp_train.shape, y_smp_train.shape, random_submit.shape, y_smp_test.shape

((1000000, 15, 1), (1000000, 200, 3), (100000, 15, 6), (100000, 200, 3))

In [5]:
y_smp_train = np.concatenate(
    [y_smp_train,
        (y_smp_train[:,:,0] - y_smp_train[:,:,1])[...,None],
        (y_smp_train[:,:,2] - y_smp_train[:,:,1])[...,None],
        (y_smp_train[:,:,0] - y_smp_train[:,:,2])[...,None],
    ], axis=2)
y_smp_train.shape

(1000000, 200, 6)

In [6]:
y_smp_test = np.concatenate(
    [y_smp_test,
        (y_smp_test[:,:,0] - y_smp_test[:,:,1])[...,None],
        (y_smp_test[:,:,2] - y_smp_test[:,:,1])[...,None],
        (y_smp_test[:,:,0] - y_smp_test[:,:,2])[...,None],
    ], axis=2)
y_smp_test.shape

(100000, 200, 6)

In [10]:
# Делаем reshape для модели (flatten)
reshape_pars_smp_train = np.reshape(pars_smp_train,(1_000_000, -1))
reshape_y_smp_train = np.reshape(y_smp_train,(1_000_000, -1))
reshape_y_smp_test = np.reshape(y_smp_test,(100_000, -1))

reshape_pars_smp_train.shape, reshape_y_smp_train.shape, reshape_y_smp_test.shape

((1000000, 15), (1000000, 1200), (100000, 1200))

# Формируем train/test/val

In [14]:
data_X_train = reshape_y_smp_train
data_y_train = reshape_pars_smp_train
X_train, X_val, y_train, y_val = train_test_split(reshape_y_smp_train, reshape_pars_smp_train, test_size=0.05, random_state=53)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((950000, 1320), (50000, 1320), (950000, 15), (50000, 15))

In [13]:
# # COUNT_ROW = 1_000_000
# COUNT_ROW = 400_000
# data_X_train = reshape_y_smp_train[:COUNT_ROW]
# data_y_train = reshape_pars_smp_train[:COUNT_ROW]
# X_train, X_val, y_train, y_val = train_test_split(data_X_train, data_y_train, test_size=0.2, random_state=53)
# X_test, X_val, y_test, y_val = train_test_split(X_val, y_val, test_size=0.5, random_state=53)

# X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((320000, 1200),
 (40000, 1200),
 (40000, 1200),
 (320000, 15),
 (40000, 15),
 (40000, 15))

# Модель

- Для бейзлайна будем пробовать 15 моделей бустинга, для каждого предсказываемого признака
- Попробовать разные лоссы

In [15]:
# Вывод графика feature importance
def plot_feature_importance(importance, names, model_name="", top_n=-1, skip_columns=[]):
    """Функция вывода feature importance
        :importance - массив важности фичей, полученный от модели
        :names - массив названий фичей
        :model_name - название модели
        :top_n - кол-во выводимых фичей
        :skip_columns: какие фичи пропустить, такое может понадобиться чтобы временно убрать 
                        из отображаемых горячие фичи, и изучить менее сильные
        :return - fi_df - feature importance датафрейм
    """
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    fi_df = fi_df[~fi_df['feature_names'].isin(skip_columns)]
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    
    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'][:top_n], y=fi_df['feature_names'][:top_n])
    if top_n != -1:
        plt.title(f"{model_name} FEATURE IMPORTANCE (Top: {top_n})")
    else:
        plt.title(f"{model_name} FEATURE IMPORTANCE")
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    return fi_df

In [16]:
from catboost import CatBoostRegressor, Pool

In [17]:
%%time
iterations = 8000
models_dict = {}
dim_total = 15
for i in trange(dim_total):
    print(f"{i} model")
    models_dict[i] = CatBoostRegressor(eval_metric = "RMSE", iterations=iterations, early_stopping_rounds=200, random_state=53, task_type="GPU", devices='0:1')
    models_dict[i].fit(X_train,  y_train[:,i], eval_set=(X_val, y_val[:,i]), plot=True, verbose=False)

  0%|          | 0/15 [00:00<?, ?it/s]

0 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

  7%|▋         | 1/15 [14:40<3:25:27, 880.53s/it]

1 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 13%|█▎        | 2/15 [29:23<3:11:02, 881.70s/it]

2 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 20%|██        | 3/15 [44:08<2:56:42, 883.52s/it]

3 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 27%|██▋       | 4/15 [58:51<2:41:55, 883.18s/it]

4 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 33%|███▎      | 5/15 [1:12:43<2:24:06, 864.60s/it]

5 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 40%|████      | 6/15 [1:27:20<2:10:20, 868.99s/it]

6 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 47%|████▋     | 7/15 [1:40:31<1:52:27, 843.44s/it]

7 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 53%|█████▎    | 8/15 [1:51:13<1:30:54, 779.23s/it]

8 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 60%|██████    | 9/15 [2:04:10<1:17:52, 778.71s/it]

9 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 67%|██████▋   | 10/15 [2:17:56<1:06:06, 793.29s/it]

10 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 73%|███████▎  | 11/15 [2:31:42<53:33, 803.35s/it]  

11 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 80%|████████  | 12/15 [2:46:20<41:17, 825.89s/it]

12 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 87%|████████▋ | 13/15 [2:59:46<27:19, 820.00s/it]

13 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

 93%|█████████▎| 14/15 [3:12:42<13:26, 806.77s/it]

14 model


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

100%|██████████| 15/15 [3:25:37<00:00, 822.48s/it]

CPU times: total: 6h 14min 34s
Wall time: 3h 25min 37s





In [10]:
%%time
# iterations_result = int(iterations*0.01)
iterations_result = 50
print(f"iterations_result: {iterations_result}")
test_pool = Pool(reshape_y_smp_test)
test_predictions = np.empty((len(reshape_y_smp_test), len(models_dict), iterations_result))
test_predictions[:] = np.nan
print(test_predictions.shape)
for idx_model in tqdm(models_dict):
    # print(f"idx_model: {idx_model}")
    for epoch, predictions in enumerate(models_dict[idx_model].staged_predict(test_pool)):
        if epoch < (iterations- iterations_result):
            continue    
        epoch_idx = epoch - (iterations- iterations_result)
        test_predictions[:, idx_model, epoch_idx] = predictions
test_predictions.shape

iterations_result: 50
(100000, 15, 50)


  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: total: 17min 9s
Wall time: 10min 10s


(100000, 15, 50)

In [11]:
assert np.count_nonzero(np.isnan(test_predictions[:,:,4])) == 0, "Есть NaN значения в первых значениях, увеличьте число обрабатываемых иттераций (iterations_result)"

In [12]:
test_predictions_agregate = np.empty((len(reshape_y_smp_test), len(models_dict), 6))
test_predictions_agregate[:] = np.nan
for idx_row in trange(len(reshape_y_smp_test)):
    for idx_model in range(len(models_dict)):
        obj = test_predictions[idx_row, idx_model]
        test_predictions_agregate[idx_row, idx_model] = [np.nanmean(obj), np.nanquantile(obj, 0.1), np.nanquantile(obj, 0.25), np.nanquantile(obj, 0.5), np.nanquantile(obj, 0.75), np.nanquantile(obj, 0.90)]
test_predictions_agregate.shape

100%|██████████| 100000/100000 [05:45<00:00, 289.84it/s]


(100000, 15, 6)

In [13]:
assert np.count_nonzero(np.isnan(test_predictions_agregate)) == 0, "Обнаружены NaN значения в итоговых данных, увеличьте число обрабатываемых иттераций (iterations_result)"

In [14]:
random_submit.shape, test_predictions_agregate.shape

((100000, 15, 6), (100000, 15, 6))

In [15]:
file_output_submit = PATH_DATASET + "test/output_submit_v4_2.npy"
np.save(file_output_submit, test_predictions_agregate)
