In [1]:
%load_ext autoreload
%autoreload 2

import gc
import warnings

import scipy as sp
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import random
import os

# Raise pandas' display limits so wide or long frames are not truncated when a
# cell renders them.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA

import sys
# Put the parent directory on the import path so the local `src` package resolves.
sys.path.append("..")
# NOTE(review): the star import hides where names come from. SEED,
# seed_everything, AmexFeatures, amex_metric, telegram and submit_amex are used
# below without being defined in this notebook, so they presumably come from
# `src` — confirm, and consider importing them explicitly.
from src import *

# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')



In [2]:
class CFG:
    """Run configuration read by the cells of this notebook."""

    # Cross-validation layout: rows are spread over `n_folds` folds; when
    # `only_one_iter` is True the training loop stops after fold 0.
    n_folds = 5
    only_one_iter = True

    # When True, train/test are cut down to the columns stored in
    # ../tmp/selected_features_step1.pickle before training.
    feature_selection_step1 = True

    # `seed` is handed to seed_everything; each entry of `seed_list` becomes the
    # CatBoost random_seed of one model trained per fold.
    seed = SEED
    seed_list = [111]

    # Keyword arguments forwarded to CatBoostClassifier.
    # Not set here (previously commented out): l2_leaf_reg, random_strength,
    # bagging_temperature, max_depth, one_hot_max_size, rsm.
    model_params = {
        "iterations": 1_000_000,  # very high cap; early stopping is expected to end training
        "learning_rate": 0.03,
        "loss_function": "Logloss",
        "use_best_model": True,
        "eval_metric": "Logloss",
        "early_stopping_rounds": 1000,
        "task_type": "GPU",
    }


seed_everything(CFG.seed)

In [3]:
# AmexFeatures is not defined in this notebook (presumably provided by `src`);
# the instance is used below to list the categorical columns.
features = AmexFeatures()

## Download

In [4]:
%%time
# Read the train and test feature tables (aggregated features, judging by the
# file names) from the project's tmp directory and report the train shape.
train = pd.read_parquet("../tmp/train_agg.parquet")
test = pd.read_parquet("../tmp/test_agg.parquet")
print(train.shape)

(458913, 4448)
CPU times: user 35.3 s, sys: 38 s, total: 1min 13s
Wall time: 36.6 s


In [5]:
# Optionally narrow both frames to the feature subset saved by selection step 1,
# keeping the identifier (and, for train, the label) in front.
if CFG.feature_selection_step1:
    with open("../tmp/selected_features_step1.pickle", "br") as fh:
        # pickle.load executes whatever the file encodes: only load trusted artifacts.
        cols = pickle.load(fh)
    train_columns = ["customer_ID", "target"] + cols
    test_columns = ["customer_ID"] + cols
    train = train[train_columns]
    test = test[test_columns]
print(train.shape)

(458913, 2765)


In [6]:
# Assign every row to one of CFG.n_folds validation folds, uniformly at random.
# A dedicated generator seeded from CFG.seed is used instead of the global NumPy
# state, so the split is the same every time this cell runs, regardless of how
# many random draws happened before it or whether the cell is re-executed.
# Folds are neither stratified by target nor forced to equal size.
fold_rng = np.random.RandomState(CFG.seed)
train["fold"] = fold_rng.randint(low=0, high=CFG.n_folds, size=train.shape[0])

In [7]:
# Ask the feature helper which columns of `train` are categorical, then store
# those columns as int8 in both frames (the same list is later passed to
# CatBoost as `cat_features`).
cat_features = features.get_categorical(train)
for frame in (train, test):
    frame[cat_features] = frame[cat_features].astype(np.int8)

In [8]:
# Train one CatBoost model per (fold, seed) pair, score it on the held-out fold
# and average the test-set probabilities over every model that was trained.
# Leaves `cb` (the last fitted model), `prediction`, `iterations` and the two
# mean scores in the namespace for the cells below.

# Name the model inputs explicitly. Selecting the test columns by this list
# (rather than "everything except customer_ID") keeps the cell re-runnable after
# extra columns such as `prediction` have been added to `test`, and guarantees
# train and test present the same features in the same order.
non_feature_cols = ["customer_ID", "target", "fold"]
feature_cols = [col for col in train.columns if col not in non_feature_cols]

# Make sure the output directory exists before spending time on a long fit.
os.makedirs("../models", exist_ok=True)

prediction = np.zeros(test.shape[0])
amex_metric_score = 0
logloss_score = 0
iterations = 0  # number of models trained so far (folds x seeds)

for fold in range(0, CFG.n_folds):
    # Quick-experiment mode: train on the first fold only.
    if CFG.only_one_iter and fold != 0:
        break
    print(f" Fold {fold} ".center(80, "="))

    is_val = train["fold"] == fold

    X_train = train.loc[~is_val, feature_cols]
    y_train = train.loc[~is_val, "target"]

    X_val = train.loc[is_val, feature_cols]
    y_val = train.loc[is_val, "target"]

    for seed in CFG.seed_list:
        print(f" Seed {seed} ".center(80, "-"))

        cb = CatBoostClassifier(**CFG.model_params,
                                random_seed=seed)
        cb.fit(
            X=X_train,
            y=y_train,
            eval_set=(X_val, y_val),
            cat_features=cat_features,
            verbose=1000
        )

        cb.save_model(f"../models/cb_fold_{fold}_seed_{seed}.cb_model")

        # Validation scores for this model: competition metric on the predicted
        # positive-class probability, plus the best validation Logloss seen
        # during training.
        amex_metric_fold = amex_metric(y_val, cb.predict_proba(X_val)[:, 1])
        amex_metric_score += amex_metric_fold
        logloss_fold = cb.best_score_['validation']['Logloss']
        logloss_score += logloss_fold
        print(f"Fold {fold}, seed: {seed}: amex_metric {amex_metric_fold}, logloss {logloss_fold}")

        # Accumulate test probabilities; they are averaged after the loop.
        pred_fold = cb.predict_proba(test[feature_cols])[:, 1]
        prediction += pred_fold

        iterations += 1

        print()

# Fail with an explicit message instead of a bare ZeroDivisionError.
if iterations == 0:
    raise RuntimeError("No model was trained: check CFG.n_folds and CFG.seed_list")

amex_metric_score /= iterations
prediction /= iterations
logloss_score /= iterations

print(f"MEAN AMEX METRIC: {amex_metric_score}, LOGLOSS: {logloss_score}")
telegram(f"Train model end: MEAN AMEX METRIC: {amex_metric_score}, LOGLOSS: {logloss_score}")

----------------------------------- Seed 111 -----------------------------------
0:	learn: 0.6562541	test: 0.6563657	best: 0.6563657 (0)	total: 109ms	remaining: 1d 6h 16m 12s
1000:	learn: 0.2133631	test: 0.2214788	best: 0.2214788 (1000)	total: 1m 29s	remaining: 1d 46m 2s
2000:	learn: 0.2033058	test: 0.2199123	best: 0.2199123 (2000)	total: 2m 57s	remaining: 1d 31m 23s
3000:	learn: 0.1945715	test: 0.2192444	best: 0.2192444 (3000)	total: 4m 25s	remaining: 1d 28m 11s
4000:	learn: 0.1866683	test: 0.2188061	best: 0.2188051 (3997)	total: 5m 53s	remaining: 1d 26m 28s
5000:	learn: 0.1793934	test: 0.2186356	best: 0.2186194 (4881)	total: 7m 21s	remaining: 1d 24m 19s
6000:	learn: 0.1725556	test: 0.2184451	best: 0.2184374 (5995)	total: 8m 49s	remaining: 1d 21m
7000:	learn: 0.1661601	test: 0.2183360	best: 0.2183327 (6985)	total: 10m 17s	remaining: 1d 19m 13s
8000:	learn: 0.1601399	test: 0.2183020	best: 0.2182863 (7865)	total: 11m 45s	remaining: 1d 18m 10s
bestTest = 0.2182862696
bestIteration = 7865

In [9]:
# Summarise the feature importances of `cb` (the most recently fitted model from
# the training loop) and collect the weakest features.
#
# Feature names are assumed to look like "<letter>_<number>_<aggregation>_<window>"
# (e.g. "P_2_last_3m"): the first two parts identify the base feature, the parts
# between them and the final one identify the aggregation.

# Cut-off rule: a feature is dropped when its importance is below
# IMPORTANCE_FRACTION times the importance of the feature at REFERENCE_RANK
# (0-based, in descending order of importance).
REFERENCE_RANK = 100
IMPORTANCE_FRACTION = 0.1

agg_imp = defaultdict(float)      # total importance per aggregation
feature_imp = defaultdict(float)  # total importance per base feature

drop_features = []

sorted_feature_imp = sorted(zip(cb.feature_importances_, cb.feature_names_), reverse=True)

if sorted_feature_imp:
    # Clamp the anchor to the last feature so models with fewer than
    # REFERENCE_RANK + 1 features do not raise IndexError.
    anchor_rank = min(REFERENCE_RANK, len(sorted_feature_imp) - 1)
    min_imp = sorted_feature_imp[anchor_rank][0] * IMPORTANCE_FRACTION
else:
    min_imp = 0.0

for imp, name in sorted_feature_imp:
    parts = name.split("_")
    agg_imp["_".join(parts[2:-1])] += imp
    feature_imp["_".join(parts[:2])] += imp
    print(name, imp)
    if imp < min_imp:
        drop_features.append(name)

P_2_last_3m 8.687045286053307
B_1_last_3m 3.7899465648942208
S_3_mean_13m 2.073367247586612
B_9_last_3m 1.6468220814316394
D_48_last_3m 1.4768782042773598
B_2_last_3m 1.4267359463129523
D_39_last_3m 1.3729682772642087
R_3_mean_13m 1.3504714114443415
D_44_last_3m 1.294781069185904
S_3_last_3m 1.1304371071343529
D_51_last_3m 0.8577964458884126
B_4_last_3m 0.8300364103257281
B_8_first_6m 0.8002885117273382
D_45_last_3m 0.7572487026477797
R_1_mean_3m 0.7539765201636645
D_47_last_3m 0.6727418886164435
B_4_last_first_diff_13m 0.6561796103961833
B_3_last_3m 0.5870526736918987
B_4_last_mean_diff_13m 0.5420905398268688
D_79_last_3m 0.5365482436651273
R_1_mean_13m 0.5298927766319265
B_3_last_first_diff_13m 0.4725132501251758
R_1_last_3m 0.46085040632870544
D_43_last_3m 0.4460035425058905
D_50_last_std_diff_6m 0.44299275641764374
B_10_mean_3m 0.4342124304240024
D_56_last_3m 0.42156481077293545
B_18_last_3m 0.42032807204858413
D_56_first_13m 0.407577112027594
D_39_mean_3m 0.406850366245888
D_50_me

In [10]:
# How many features fell below the importance cut-off.
len(drop_features)

941

In [11]:
# Persist the names of the low-importance features.
# NOTE(review): the file is called "selected_features_step2" but it stores
# `drop_features` (features to discard), whereas the step-1 pickle loaded
# earlier in this notebook is used as a keep-list. Confirm that whatever reads
# this file expects a drop-list.
with open("../tmp/selected_features_step2.pickle", "bw") as f:
    pickle.dump(drop_features, f)

In [12]:
# Aggregation types ranked by their summed importance, largest first.
sorted(agg_imp.items(), key=lambda item: -item[1])

[('last', 33.85711051994908),
 ('mean', 12.39462485303898),
 ('first', 11.378914004183184),
 ('std', 8.655176663146442),
 ('last_first_diff', 7.619071824552389),
 ('last_std_ratio', 7.339633920734294),
 ('last_mean_diff', 6.193308522960362),
 ('last_first_ratio', 4.700660767725357),
 ('last_mean_ratio', 4.385507085548708),
 ('last_std_diff', 3.4759918381612773)]

In [13]:
# Base features ranked by their summed importance, largest first.
sorted(feature_imp.items(), key=lambda item: -item[1])

[('P_2', 9.806891566939091),
 ('B_1', 4.452791675198407),
 ('S_3', 3.575325173650572),
 ('D_39', 3.3634667074020235),
 ('B_4', 2.908811126519955),
 ('R_1', 2.6239493448074978),
 ('B_9', 2.6052284060144704),
 ('D_44', 2.4540328458209246),
 ('D_48', 2.333654400152248),
 ('B_3', 2.1914581892280864),
 ('B_2', 2.1622765839626994),
 ('R_3', 2.0009730488454287),
 ('D_50', 1.6672980247452065),
 ('R_2', 1.483225331911573),
 ('D_41', 1.3984870233067332),
 ('D_51', 1.3973704931695161),
 ('D_43', 1.3120865836287783),
 ('B_8', 1.1642382901989758),
 ('D_79', 1.1570856626819102),
 ('D_45', 1.1119638044479303),
 ('B_23', 1.0737105312903146),
 ('B_6', 1.0715277092238444),
 ('D_47', 1.0380029318637223),
 ('B_10', 1.035309008416283),
 ('D_65', 0.9593557545684349),
 ('D_56', 0.8970490618913017),
 ('B_5', 0.8730665257148971),
 ('S_15', 0.8458430776662866),
 ('B_18', 0.8268164046910526),
 ('D_52', 0.820746192176459),
 ('B_22', 0.8053172044486971),
 ('R_27', 0.7877580538576654),
 ('B_40', 0.7629283840358579)

In [14]:
# Attach the averaged test probabilities to the test frame and pass it to
# submit_amex together with a description of this run.
test["prediction"] = prediction

# Short score tags: the first four characters left after removing "0." from the
# text form of each mean score.
logloss_tag = str(logloss_score).replace('0.', '')[:4]
amex_tag = str(amex_metric_score).replace('0.', '')[:4]

submit_amex(
    test,
    model="cb",
    n_cols=train.shape[1],
    task_type=CFG.model_params['task_type'],
    n_folds=CFG.n_folds,
    n_seeds=len(CFG.seed_list),
    n_iter=iterations,
    logloss_score=logloss_tag,
    amex_score=amex_tag,
    submit_on_kaggle=True,
)

b'Successfully submitted to American Express - Default Prediction'
