In [1]:
%load_ext autoreload
%autoreload 2

import gc
import warnings

import scipy as sp
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import random
import os

# Raise pandas' display limits (rows, columns, total character width) so that
# large frames shown in the notebook are truncated less aggressively.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA

import sys
sys.path.append("..")
from src import *

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas warnings about assigning
# into a sliced frame and similar diagnostics are hidden as well — confirm
# that this is intended.
warnings.filterwarnings('ignore')



In [8]:
class CFG:
    """Experiment settings read by the cells below."""

    # --- cross-validation -------------------------------------------------
    n_folds = 5
    # When True, the training loop stops after fold 0 (quick smoke run).
    only_one_iter = False

    # --- feature set ------------------------------------------------------
    # When True, `train` is narrowed to the columns stored in
    # ../tmp/selected_features_step1.pickle.
    feature_selection_step1 = True

    # --- seeding ----------------------------------------------------------
    seed = SEED          # passed to seed_everything() right after the class
    seed_list = [111]    # one CatBoost model is trained per seed and fold

    # --- CatBoost ---------------------------------------------------------
    # Keyword arguments forwarded to CatBoostClassifier. Other knobs that were
    # experimented with (l2_leaf_reg, random_strength, bagging_temperature,
    # max_depth, one_hot_max_size, rsm) are not passed here.
    model_params = dict(
        iterations=1000000,
        learning_rate=0.02,
        loss_function="Logloss",
        use_best_model=True,
        eval_metric="Logloss",
        early_stopping_rounds=3000,
        task_type="GPU",
    )


seed_everything(CFG.seed)

In [3]:
# Feature helper object (presumably provided by `from src import *` — confirm);
# it is used further down to look up the categorical columns of `train`.
features = AmexFeatures()

## Download

In [4]:
%%time
# Read the training table from parquet and report its (rows, columns) shape.
# NOTE(review): the file name suggests pre-aggregated features — confirm
# against the step that writes ../tmp/train_agg.parquet.
train = pd.read_parquet("../tmp/train_agg.parquet")
print(train.shape)

(458913, 5732)
CPU times: user 16.3 s, sys: 12.8 s, total: 29.1 s
Wall time: 15.5 s


In [5]:
# Optionally narrow `train` to the identifier, the label and the feature
# columns saved in the step-1 selection file; the resulting shape is printed
# either way.
if CFG.feature_selection_step1:
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only load artifacts produced by this project.
    with open("../tmp/selected_features_step1.pickle", "br") as features_file:
        cols = pickle.load(features_file)
    keep_columns = ["customer_ID", "target"] + cols
    train = train[keep_columns]
print(train.shape)

(458913, 3550)


In [6]:
# Assign every row to one of CFG.n_folds validation folds uniformly at random.
#
# A dedicated generator seeded from CFG.seed is used instead of the global
# NumPy state: with the global state the split depends on whatever random
# calls ran earlier in the kernel (and changes each time this cell is re-run),
# which makes fold-level scores impossible to reproduce.
# NOTE(review): assumes CFG.seed is an integer accepted by
# np.random.RandomState — confirm against the definition of SEED.
fold_rng = np.random.RandomState(CFG.seed)
train["fold"] = fold_rng.randint(low=0, high=CFG.n_folds, size=train.shape[0])

In [7]:
# Ask the feature helper which columns of `train` are categorical; the same
# list is later passed to CatBoost as `cat_features`.
cat_features = features.get_categorical(train)
# Store those columns as int8.
# NOTE(review): assumes the values are already integer-coded, free of NaN and
# within the int8 range — confirm against the aggregation step.
train[cat_features] = train[cat_features].astype(np.int8)

In [9]:
# Cross-validated CatBoost training.
#
# For every fold (and every seed in CFG.seed_list) a CatBoostClassifier is fit
# on the rows outside the fold and evaluated on the rows inside it. Each model
# is saved to ../models, its feature importances are added into
# `global_features_imp`, and the competition metric plus the best validation
# Logloss are averaged over all trained models and reported at the end.
amex_metric_score = 0
logloss_score = 0
iterations = 0  # number of models trained; used as the averaging denominator

global_features_imp = defaultdict(float)

# Make sure the output directory exists before any training starts: the model
# is written only after a long fit, so a missing directory should be fixed up
# front rather than discovered at save time.
os.makedirs("../models", exist_ok=True)

# Columns that must not be fed to the model: identifier, label, fold marker.
non_feature_cols = ["customer_ID", "target", "fold"]

for fold in range(0, CFG.n_folds):
    # Quick mode: only the first fold is trained.
    if CFG.only_one_iter and fold != 0:
        break
    print(f" Fold {fold} ".center(80, "="))

    # Rows outside the current fold train the model, rows inside validate it.
    X_train, X_val = train[train["fold"] != fold], train[train["fold"] == fold]

    y_train = X_train["target"]
    X_train = X_train.drop(non_feature_cols, axis=1)

    y_val = X_val["target"]
    X_val = X_val.drop(non_feature_cols, axis=1)

    for seed in CFG.seed_list:
        print(f" Seed {seed} ".center(80, "-"))

        cb = CatBoostClassifier(**CFG.model_params,
                                random_seed=seed)
        cb.fit(
            X=X_train,
            y=y_train,
            eval_set=(X_val, y_val),
            cat_features=cat_features,
            verbose=1000
        )

        cb.save_model(f"../models/cb_fold_{fold}_seed_{seed}.cb_model")

        # Sum importances over all models; only the relative ranking is used
        # afterwards, so no normalisation is applied here.
        for name, imp in zip(cb.feature_names_, cb.feature_importances_):
            global_features_imp[name] += imp

        # Score the positive-class probability with the project metric helper.
        amex_metric_fold = amex_metric(y_val, cb.predict_proba(X_val)[:, 1])
        amex_metric_score += amex_metric_fold

        logloss_fold = cb.best_score_['validation']['Logloss']
        logloss_score += logloss_fold

        print(f"Fold {fold}, seed: {seed}: amex_metric {amex_metric_fold}, logloss {logloss_fold}")

        iterations += 1

        print()


# Turn the running sums into means over all trained models.
amex_metric_score /= iterations
logloss_score /= iterations

print(f"MEAN AMEX METRIC: {amex_metric_score}, LOGLOSS: {logloss_score}")
telegram(f"Train model end: MEAN AMEX METRIC: {amex_metric_score}, LOGLOSS: {logloss_score}")

----------------------------------- Seed 111 -----------------------------------
0:	learn: 0.6681754	test: 0.6682246	best: 0.6682246 (0)	total: 131ms	remaining: 1d 12h 19m 53s
1000:	learn: 0.2183748	test: 0.2227034	best: 0.2227034 (1000)	total: 1m 44s	remaining: 1d 4h 50m 35s
2000:	learn: 0.2104730	test: 0.2206231	best: 0.2206224 (1999)	total: 3m 26s	remaining: 1d 4h 33m 59s
3000:	learn: 0.2041027	test: 0.2198546	best: 0.2198546 (3000)	total: 5m 7s	remaining: 1d 4h 20m 43s
4000:	learn: 0.1982560	test: 0.2194606	best: 0.2194606 (4000)	total: 6m 47s	remaining: 1d 4h 11m 57s
5000:	learn: 0.1927654	test: 0.2191232	best: 0.2191232 (5000)	total: 8m 28s	remaining: 1d 4h 6m 13s
6000:	learn: 0.1876167	test: 0.2188690	best: 0.2188591 (5977)	total: 10m 8s	remaining: 1d 4h 1m 1s
7000:	learn: 0.1826458	test: 0.2187106	best: 0.2186977 (6930)	total: 11m 49s	remaining: 1d 3h 57m 26s
8000:	learn: 0.1779090	test: 0.2185944	best: 0.2185900 (7895)	total: 13m 30s	remaining: 1d 3h 53m 54s
9000:	learn: 0.173

16000:	learn: 0.1452952	test: 0.2219020	best: 0.2218145 (13463)	total: 26m 57s	remaining: 1d 3h 37m 38s
bestTest = 0.2218144737
bestIteration = 13463
Shrink model to first 13464 iterations.
Fold 3, seed: 111: amex_metric 0.7851456401524899, logloss 0.22181447372015517

----------------------------------- Seed 111 -----------------------------------
0:	learn: 0.6681439	test: 0.6680412	best: 0.6680412 (0)	total: 122ms	remaining: 1d 9h 59m 10s
1000:	learn: 0.2183417	test: 0.2224605	best: 0.2224605 (1000)	total: 1m 42s	remaining: 1d 4h 28m 24s
2000:	learn: 0.2104919	test: 0.2204898	best: 0.2204895 (1999)	total: 3m 24s	remaining: 1d 4h 16m 14s
3000:	learn: 0.2041282	test: 0.2196913	best: 0.2196904 (2998)	total: 5m 5s	remaining: 1d 4h 10m 13s
4000:	learn: 0.1982471	test: 0.2192200	best: 0.2192200 (4000)	total: 6m 46s	remaining: 1d 4h 5m 20s
5000:	learn: 0.1927380	test: 0.2189031	best: 0.2189019 (4999)	total: 8m 27s	remaining: 1d 4h 1m 43s
6000:	learn: 0.1876036	test: 0.2187135	best: 0.218708

In [14]:
# Feature names ordered by accumulated importance (largest first), cut to the
# top 2000.
selected_cols = sorted(
    global_features_imp,
    key=lambda feature: -global_features_imp[feature],
)[:2000]

In [15]:
# Write the selected feature names to disk so they can be reloaded later.
with open("../tmp/selected_features_step2.pickle", "bw") as out_file:
    pickle.dump(selected_cols, out_file)

In [16]:
# Break down the importances of the most recently fitted model (`cb`) two ways:
#   agg_imp     - summed per middle part of the feature name (tokens 2..-2),
#   feature_imp - summed per leading two tokens of the feature name.
# Every feature is printed from most to least important, and features whose
# importance is below 10% of the entry at index 100 are collected in
# `drop_features`.
# NOTE(review): only the last `cb` is inspected here, not the importances
# accumulated over all folds — confirm that this is intended.
agg_imp, feature_imp = defaultdict(int), defaultdict(int)

sorted_feature_imp = sorted(
    zip(cb.feature_importances_, cb.feature_names_),
    reverse=True,
)
min_imp = 0.1 * sorted_feature_imp[100][0]

drop_features = []
for imp, name in sorted_feature_imp:
    tokens = name.split("_")
    agg_key = "_".join(tokens[2:-1])
    base_key = "_".join(tokens[:2])

    agg_imp[agg_key] += imp
    feature_imp[base_key] += imp

    print(name, imp)
    if imp < min_imp:
        drop_features.append(name)

P_2_last_2m 7.866237033536478
B_11_last_2m 3.4982804308637365
B_2_last_2m 1.551925710797712
S_3_last_2m 1.3250591032507786
D_44_last_2m 1.2662794526694048
B_9_last_2m 1.086813005290026
D_48_last_2m 0.9684415492532967
D_39_last_2m 0.9283397946274003
B_4_last_2m 0.8953784077444245
D_45_last_2m 0.8808057824985626
B_8_first_6m 0.8263281933722426
B_3_last_2m 0.7945274053782144
D_47_last_2m 0.7735298669415411
R_1_mean_6m 0.651556499825894
D_43_last_2m 0.6434029393967029
D_56_last_2m 0.615009429733254
D_51_max_6m 0.5979185837361081
S_3_min_6m 0.578074235204081
R_1_mean_2m 0.5771935561687759
D_39_std_6m 0.5431313908015584
B_23_last_2m 0.5412859843403077
R_3_mean_6m 0.5168114127517881
S_8_mean_6m 0.5098721126113883
D_39_mean_2m 0.49114235176009546
B_3_std_6m 0.4288699759094203
D_51_last_2m 0.4287755184815934
D_62_min_6m 0.41231597627187777
B_8_last_2m 0.4000260856789889
D_79_last_2m 0.3983100019026826
R_1_last_2m 0.3857924881794771
D_65_mean_6m 0.373158004822834
B_4_last_min_diff_6m 0.370559994

In [18]:
# Importance totals per name-middle-part key, largest first.
sorted(agg_imp.items(), key=lambda item: -item[1])

[('last', 32.57342467327522),
 ('first', 8.977635970852841),
 ('mean', 7.744013175112954),
 ('last_std_ratio', 7.302172492017076),
 ('std', 6.636441759281589),
 ('last_max_ratio', 5.095876622898924),
 ('min', 5.051310085607678),
 ('last_first_diff', 3.8043977472320853),
 ('last_min_diff', 3.554858349239746),
 ('last_first_ratio', 3.1538680636191687),
 ('last_std_diff', 3.0950123520454706),
 ('last_mean_ratio', 2.9368369206696867),
 ('max', 2.934591465826553),
 ('last_min_ratio', 2.535529054119669),
 ('last_max_diff', 2.269827043803019),
 ('last_mean_diff', 2.1413883676934495),
 ('median', 0.1928158567052909)]

In [20]:
# Importance totals per leading-two-token key, largest first.
sorted(feature_imp.items(), key=lambda item: -item[1])

[('P_2', 8.72589455607467),
 ('B_11', 4.15011613871617),
 ('D_39', 3.0763796001997594),
 ('S_3', 2.7889575935606006),
 ('B_2', 2.3122884604575913),
 ('B_4', 2.231851756202735),
 ('R_1', 2.2298171685081245),
 ('B_9', 2.2082218225129226),
 ('B_3', 2.1699236806439695),
 ('D_44', 1.9351500946302955),
 ('D_43', 1.7445689579472647),
 ('D_48', 1.7155681137150003),
 ('B_8', 1.6517221827894184),
 ('D_50', 1.4946765075623005),
 ('D_51', 1.367998285923387),
 ('R_2', 1.358867105643067),
 ('B_23', 1.3015798897681214),
 ('D_45', 1.2093117310085457),
 ('R_3', 1.1454530800258818),
 ('D_41', 1.120779364300511),
 ('D_47', 1.1168007532903337),
 ('S_8', 1.027860967451726),
 ('B_5', 0.9386914818285009),
 ('D_55', 0.916216340030048),
 ('D_79', 0.91376938447291),
 ('S_25', 0.8797233833364744),
 ('D_65', 0.8764647552263228),
 ('B_22', 0.8383177393202914),
 ('B_10', 0.8179401113151035),
 ('D_46', 0.8113379850717862),
 ('S_15', 0.7659995824340126),
 ('B_18', 0.763368266721969),
 ('B_6', 0.7514584968319562),
 ('