In [1]:
# !pip install kaggle
# !mkdir /root/.kaggle
# !cp /storage/kaggle.json /root/.kaggle/kaggle.json
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p /storage/data --unzip

In [2]:
!nvidia-smi

Mon Sep 12 03:54:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 5000     Off  | 00000000:00:05.0 Off |                  Off |
| 33%   26C    P8     3W / 230W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
# Show which files are present in the data directory. The path is spelled out
# here because the CWD constant is only defined in a later cell.
os.listdir('/storage/data')

['train.parquet',
 '__notebook_source__.ipynb',
 'cust_type.parquet',
 'test.parquet',
 'train_labels.csv']

In [4]:
import pandas as pd, numpy as np 
pd.set_option('mode.chained_assignment', None)
import cupy, cudf
from numba import cuda
from cuml import ForestInference
import joblib
import time
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import random
from tqdm import tqdm
import pickle
import random
import gc

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb
print('XGB Version',xgb.__version__)

XGB Version 1.6.1


In [5]:
# Notebook-wide configuration.
CWD = '/storage/data' # Replace CWD with your own data directory
# Seed reused for the customer split, the synthetic 'random' feature and the model.
SEED = 42
# Sentinel written in place of missing values by fillna() in the data cells.
NAN_VALUE = -127 # will fit in int8
# Number of cross-validation folds built in the "Split Data" cell.
FOLDS = 10
# Raw categorical columns of the dataset.
# NOTE(review): this constant is not referenced in the visible cells;
# process_and_feature_engineer repeats the same list inline.
RAW_CAT_FEATURES = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]

# Utils

In [6]:
def fast_auc(y_true, y_prob):
    """Compute the ROC AUC of binary labels against predicted scores on the GPU.

    Labels are sorted by ascending score; for every positive the number of
    negatives ranked below it is accumulated, and the total is divided by
    (#negatives * #positives).

    Parameters
    ----------
    y_true : array-like of 0/1 labels (NumPy or CuPy).
    y_prob : array-like of scores, same length as `y_true` (NumPy or CuPy).

    Returns
    -------
    0-d cupy.ndarray holding the AUC.

    Notes
    -----
    Tied scores are ordered arbitrarily by the sort (no tie correction).
    If only one class is present the denominator is zero and the result is
    not a meaningful AUC.
    """
    # Move both inputs to the device explicitly so callers may pass either
    # NumPy or CuPy arrays. float64 keeps the pair counts exact even when the
    # labels arrive as float32 (integers below 2**53 are exact in float64).
    labels = cupy.asarray(y_true, dtype=cupy.float64)
    scores = cupy.asarray(y_prob)

    labels = labels[cupy.argsort(scores)]
    # Running count of negatives seen so far in score order.
    cumfalses = cupy.cumsum(1 - labels)
    nfalse = cumfalses[-1]
    # Each positive contributes the number of negatives ranked below it.
    auc = (labels * cumfalses).sum()
    return auc / (nfalse * (len(labels) - nfalse))

def eval_auc(preds, dtrain):
    """Score `preds` against the labels stored in `dtrain` using fast_auc.

    Returns the tuple ``('auc', value, True)``.
    NOTE(review): the trailing True is presumably a "higher is better" flag
    for the consuming framework - confirm against the caller; this function
    is not referenced in the visible cells.
    """
    score = fast_auc(dtrain.get_label(), preds)
    return 'auc', score, True

# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    """Feed a DataFrame to XGBoost in consecutive row batches.

    Each call to `next` slices `batch_size` rows from `df`, wraps them in a
    cudf.DataFrame and hands the feature columns and the target column to the
    callback supplied by XGBoost.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*512):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Index of the batch that the next call to `next` will deliver.
        self.it = 0
        # Batches needed to cover every row; the final one may be shorter.
        self.batches = int(np.ceil(len(df) / self.batch_size))
        super().__init__()

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Deliver the next batch through `input_data`.

        Returns 1 after delivering a batch and 0 once all batches are consumed.
        """
        if self.it == self.batches:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        batch = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=batch[self.features], label=batch[self.target])
        self.it += 1
        return 1

def train_xgb_params(params, train, val_folds, features, early_stopping_rounds, num_boost_round, verbose_eval, data_seed, model_seed, ver='model', save_dir='./'):
    """Train one XGBoost model per validation fold and collect out-of-fold results.

    Parameters
    ----------
    params : dict
        XGBoost parameters merged over the base settings defined below.
    train : DataFrame
        Must contain the columns in `features` plus 'target', 'fold' and
        'customer_ID'. Fold membership is read from the 'fold' column.
    val_folds : iterable
        Fold ids; each is held out once while the remaining rows are used for
        training.
    features : list of str
        Columns used as model inputs.
    early_stopping_rounds, num_boost_round, verbose_eval
        Forwarded to ``xgb.train``; a truthy `verbose_eval` also prints the
        per-fold AUC.
    data_seed
        Unused: folds are taken from ``train['fold']``. Kept so existing
        callers keep working.
    model_seed
        Value for XGBoost's ``random_state``.
    ver : str
        Prefix of the saved model file names.
    save_dir : str or falsy
        Directory for ``{ver}_fold{fold}_{best_iter}.xgb``; nothing is saved
        when falsy.

    Returns
    -------
    (acc, oof, importances)
        `acc` is the AUC over all out-of-fold predictions, `oof` holds
        'target' and 'oof_pred' indexed by customer_ID, and `importances` is a
        pandas DataFrame with one ``importance_{fold}`` column per fold
        (split-count importance; NaN where a fold's model never used a feature).
    """
    importances = []
    oof = []

    for fold in val_folds:
        # Rows of the held-out fold; computed once and reused below.
        is_valid = train['fold'] == fold

        Xy_train = IterLoadForDMatrix(train.loc[train['fold'] != fold], features, 'target')
        X_valid = train.loc[is_valid, features]
        y_valid = train.loc[is_valid, 'target']

        dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

        # TRAIN MODEL FOLD K
        # Base parameters; caller-supplied `params` take precedence.
        xgb_parms = {
            'eval_metric':'auc',
            'objective':'binary:logistic',
            'tree_method':'gpu_hist',
            'predictor':'gpu_predictor',
            'random_state':model_seed,
        }
        xgb_parms.update(params)
        model = xgb.train(xgb_parms,
                    dtrain=dtrain,
                    evals=[(dtrain,'train'), (dvalid,'valid')],
                    num_boost_round=num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=verbose_eval,
                    )
        best_iter = model.best_iteration
        if save_dir:
            model.save_model(os.path.join(save_dir, f'{ver}_fold{fold}_{best_iter}.xgb'))

        # GET FEATURE IMPORTANCE FOR FOLD K
        dd = model.get_score(importance_type='weight')
        df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
        df = df.set_index('feature')
        importances.append(df)

        # INFER OOF FOLD K (only the trees up to the early-stopping optimum)
        oof_preds = model.predict(dvalid, iteration_range=(0, best_iter + 1))
        if verbose_eval:
            # Predictions come back on the host; move them to the device so
            # both arguments of fast_auc live in the same memory space.
            acc = fast_auc(y_valid.values, cupy.asarray(oof_preds))
            print(f'acc_{fold}', acc)

        # SAVE OOF
        df = train.loc[is_valid, ['customer_ID','target'] ].copy()
        df['oof_pred'] = oof_preds
        oof.append( df )

        del Xy_train, df, X_valid, y_valid, dtrain, dvalid, model, is_valid
        _ = gc.collect()

    # Outer-join on feature name: one importance column per fold.
    importances = pd.concat(importances, axis=1)
    oof = cudf.concat(oof, axis=0, ignore_index=True).set_index('customer_ID')
    acc = fast_auc(oof.target.values, oof.oof_pred.values)
    return acc, oof, importances

def clear_file(data_dir):
    """Delete the files directly inside `data_dir`.

    Entries whose name contains '.ipynb_checkpoints' are kept, and
    sub-directories are skipped: os.remove cannot delete a directory and
    would otherwise abort the cleanup half-way through.

    Parameters
    ----------
    data_dir : str
        Directory to empty. It must already exist.
    """
    for name in os.listdir(data_dir):
        if '.ipynb_checkpoints' in name:
            continue
        path = os.path.join(data_dir, name)
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)

# Split Data

In [7]:
# Build the customer -> fold assignment and persist it to CUST_SPLITS.csv.
# Only the customer IDs of the label file are needed here.
train = cudf.read_csv(os.path.join(CWD, 'train_labels.csv'))
# Same ID encoding as read_file: last 16 hex characters stored as int64.
train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')

train_cust = train[['customer_ID']]
# 20% of the customers are set aside and tagged with the marker fold 9999.
CUST_SPLITS = train_cust.sample(frac=0.2, random_state=SEED)
CUST_SPLITS['fold'] = 9999

# The remaining customers are split into FOLDS shuffled cross-validation folds.
train_cv_cust = train_cust[~train_cust['customer_ID'].isin(CUST_SPLITS['customer_ID'].values)].reset_index(drop=True)
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(train_cv_cust)):
    # Each customer receives the id of the fold in which it is validated.
    df = train_cv_cust.loc[valid_idx, ['customer_ID']]
    df['fold'] = fold
    CUST_SPLITS = cudf.concat([CUST_SPLITS, df])
CUST_SPLITS = CUST_SPLITS.set_index('customer_ID')
# Written relative to the kernel's working directory; re-read in a later cell.
CUST_SPLITS.to_csv('CUST_SPLITS.csv')

# Release the label frame; `train` is rebuilt from the parquet file below.
del train, train_cust
_ = gc.collect()

# Displayed as the cell output: number of customers per fold id.
CUST_SPLITS['fold'].value_counts()

# Data

In [8]:
def read_file(path = '', usecols = None):
    """Read a parquet file with cudf and normalise its key columns.

    Parameters
    ----------
    path : str
        Parquet file to read.
    usecols : list of str, optional
        Columns to load; every column is loaded when None.

    Returns
    -------
    cudf.DataFrame with 'customer_ID' converted to int64, 'S_2' converted to
    datetime and missing values replaced by NAN_VALUE.
    """
    # columns=None is cudf's default (read everything), so a single call
    # covers both the "subset" and the "all columns" case.
    frame = cudf.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Keep the last 16 hex characters of the ID and store them as an integer.
    frame['customer_ID'] = frame['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    frame.S_2 = cudf.to_datetime(frame.S_2)

    # Rows are deliberately not re-sorted here.
    # NOTE(review): agg('last') downstream then relies on the file already
    # being ordered by customer and date - confirm for new data sources.

    # FILL NAN
    frame = frame.fillna(NAN_VALUE)
    print('shape of data:', frame.shape)

    return frame

# Load every column of train.parquet through read_file (IDs encoded as int64,
# missing values replaced by NAN_VALUE).
print('Reading train data...')
train = read_file(path=os.path.join(CWD, 'train.parquet'))

def process_and_feature_engineer(df):
    """Aggregate the rows of each customer into a single feature row.

    Numerical columns get mean/std/min/max/last, categorical columns get
    count/last/nunique. Output columns are named ``{column}_{statistic}`` and
    the result is indexed by customer_ID.

    Parameters
    ----------
    df : cudf.DataFrame
        Must contain 'customer_ID', 'S_2' and the feature columns.

    Returns
    -------
    cudf.DataFrame with the numerical aggregates followed by the categorical
    aggregates.
    """
    # FEATURE ENGINEERING FROM
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    key_cols = ['customer_ID','S_2']
    # Everything that is neither a key column nor categorical is numerical.
    num_features = [c for c in list(df.columns) if c not in key_cols and c not in cat_features]

    num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    # Flatten the (column, statistic) pairs into single names.
    num_agg.columns = list(map('_'.join, num_agg.columns))

    cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    cat_agg.columns = list(map('_'.join, cat_agg.columns))

    engineered = cudf.concat([num_agg, cat_agg], axis=1)
    del num_agg, cat_agg
    print('shape after engineering', engineered.shape )

    return engineered

# Collapse the raw rows into one aggregated row per customer (index: customer_ID).
train = process_and_feature_engineer(train)
# Aggregation can introduce new missing values; fill them with the same sentinel.
# (presumably the std of single-row customers - TODO confirm)
train = train.fillna(NAN_VALUE)

# ADD TARGETS
targets = cudf.read_csv(os.path.join(CWD, 'train_labels.csv'))
# Encode the ID exactly as read_file does so the indexes line up in the merge.
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')
train = train.merge(targets, left_index=True, right_index=True, how='left')
train.target = train.target.astype('int8')
del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
# reset_index() turns customer_ID back into the first column.
train = train.sort_index().reset_index()

# FEATURES
# Every column except the first (customer_ID) and the last (target).
FEATURES = list(train.columns[1:-1])
print(f'There are {len(FEATURES)} features!')

Reading train data...
shape of data: (5531451, 190)
shape after engineering (458913, 918)
There are 918 features!


In [9]:
# ADD CUST SPLITS
CUST_SPLITS = cudf.read_csv('CUST_SPLITS.csv').set_index('customer_ID')
train = train.set_index('customer_ID').merge(CUST_SPLITS, left_index=True, right_index=True, how='left')
test = train[train['fold'] == 9999]
train = train[train['fold'] != 9999]
train = train.sort_index().reset_index()
train.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target,fold
0,-9223193039457028513,0.974068,0.013094,0.964483,1.002478,1.001372,0.0,0.0,0,0,...,0,1,13,-1,1,13,6,1,0,1
1,-9223189665817919541,0.802447,0.038025,0.694073,0.828761,0.694073,0.0,0.0,0,0,...,0,1,13,-1,1,13,6,1,0,4
2,-9223188534444851899,0.791203,0.002688,0.786647,0.794826,0.787945,0.0,0.0,0,0,...,3,2,13,-1,1,13,5,1,0,4
3,-9223173911659837606,0.115666,0.078554,0.038207,0.252421,0.040486,4.384615,6.144625,0,17,...,0,2,13,-1,1,13,6,2,1,0
4,-9223126996485486147,0.978507,0.029026,0.910546,1.009644,1.009644,0.076923,0.27735,0,1,...,0,1,13,-1,1,13,6,1,0,3


# Permutation

In [10]:
def get_permutation_importance(model_path, data, model_features, shuffle_features, shuffle_times=5):
    """Measure how much the AUC changes when single features are shuffled.

    The saved XGBoost model is loaded with cuML's ForestInference and scored
    once on the untouched data (row ``feature == 'original'``). Each feature
    in `shuffle_features` is then permuted `shuffle_times` times and the AUC
    is recorded after every permutation.

    Parameters
    ----------
    model_path : str
        Path of the saved XGBoost model.
    data : cudf.DataFrame
        Must contain `model_features` and 'target'. Columns are overwritten
        temporarily while shuffling and restored afterwards, so pass a frame
        that may be modified.
    model_features : list of str
        Columns fed to the model, in training order.
    shuffle_features : iterable of str
        Columns to permute.
    shuffle_times : int
        Permutations per column; permutation ``i`` is drawn with seed ``i``.

    Returns
    -------
    pandas.DataFrame with columns ['feature', 'metric', 'shuffle_idx'].
    """
    model = ForestInference.load(model_path, model_type='xgboost', output_class=True)
    target = data['target'].values

    def _score():
        # Column 1 of predict_proba is used as the positive-class score.
        preds = model.predict_proba(data[model_features])[1].values
        return float(fast_auc(target, preds))

    # Collect plain records and build the frame once at the end instead of
    # growing a DataFrame row by row.
    records = [{'feature': 'original', 'metric': _score(), 'shuffle_idx': 0}]
    for col in tqdm(shuffle_features):
        original_values = data[col].copy().values
        try:
            for i in range(shuffle_times):
                # The permutation is drawn by CuPy, so CuPy's generator (not
                # NumPy's) has to be seeded for the shuffle to be reproducible.
                cupy.random.seed(i)
                data[col] = cupy.random.permutation(original_values)
                records.append({'feature': col, 'metric': _score(), 'shuffle_idx': i})
        finally:
            # Always restore the column, even if scoring failed.
            data[col] = original_values
    return pd.DataFrame(records, columns=['feature', 'metric', 'shuffle_idx'])

def agg_permu_files(permu_dir, count_threshold, retrain_idx):
    """Aggregate the per-fold permutation-importance CSVs of one retrain round.

    Reads every ``permutation_importance_{retrain_idx}_*.csv`` in `permu_dir`,
    converts each shuffled AUC into a relative drop against the un-shuffled
    AUC of the same fold and feature split, and summarises the drops per
    feature. The same statistics for the synthetic 'random' feature are
    attached as a reference.

    Parameters
    ----------
    permu_dir : str
        Directory holding the CSV files.
    count_threshold : int or falsy
        When truthy, keep only features with exactly this many measurements.
    retrain_idx : int
        Retrain round whose files are aggregated.

    Returns
    -------
    pandas.DataFrame indexed by feature with columns permut_idx, min, max,
    mean, std, count, z (= mean / std), random_mean, random_std and random_z.

    Raises
    ------
    ValueError
        If no matching file exists.
    """
    # The trailing underscore keeps retrain_idx=1 from also matching the
    # files of rounds 10, 11, ...
    pattern = f'permutation_importance_{retrain_idx}_'
    permu_files = sorted(c for c in os.listdir(permu_dir) if ('.csv' in c) and (pattern in c))
    if not permu_files:
        raise ValueError(f'No permutation importance files for retrain_idx={retrain_idx} in {permu_dir}')

    frames = []
    for file in permu_files:
        df = pd.read_csv(os.path.join(permu_dir, file))
        # Second-to-last '_' token of the file name (the feature-split index).
        df['permut_idx'] = file.split('_')[-2]
        frames.append(df)
    permutation_importance_all = pd.concat(frames)

    # original_acc: un-shuffled AUC per (fold, feature split)
    keys = ['fold', 'permut_idx']
    original_acc = (
        permutation_importance_all[permutation_importance_all['feature'] == 'original']
        .set_index(keys)[['metric']]
        .rename(columns={'metric': 'metric_ori'})
    )
    permutation_importance_all = permutation_importance_all.set_index(keys).merge(original_acc, left_index=True, right_index=True, how='left')
    # Relative AUC drop caused by the shuffle (positive = the feature mattered).
    permutation_importance_all['metric_diff_ratio'] = (permutation_importance_all['metric_ori'] - permutation_importance_all['metric']) / permutation_importance_all['metric_ori']
    permutation_importance_all = permutation_importance_all.reset_index()

    # random_acc: reference statistics of the noise feature per feature split
    is_random = permutation_importance_all['feature'] == 'random'
    random_acc = permutation_importance_all[is_random].groupby(['permut_idx'])[['metric_diff_ratio']].agg(['mean', 'std'])
    random_acc.columns = ['random_mean', 'random_std']

    permutation_importance_agg = permutation_importance_all[~is_random].groupby(['feature', 'permut_idx'])['metric_diff_ratio'].agg(['min', 'max','mean', 'std', 'count'])
    if count_threshold:
        permutation_importance_agg = permutation_importance_agg[permutation_importance_agg['count'] == count_threshold]
    permutation_importance_agg = permutation_importance_agg.reset_index().set_index('permut_idx')
    permutation_importance_agg['z'] = permutation_importance_agg['mean'] / permutation_importance_agg['std']
    permutation_importance_agg = permutation_importance_agg.merge(random_acc, left_index=True, right_index=True, how='left')
    permutation_importance_agg['random_z'] = permutation_importance_agg['random_mean'] / permutation_importance_agg['random_std']
    permutation_importance_agg = permutation_importance_agg.reset_index().set_index('feature')
    return permutation_importance_agg

In [11]:
# XGBoost settings passed as `params` to train_xgb_params, where they are
# merged over the base parameters defined inside that function.
params_13m = {
    'max_depth':4, 
    'learning_rate':0.1, 
    'subsample':0.8,
    'colsample_bytree':0.1,
}

In [12]:
# ADD RANDOM FEATURE
# Pure-noise column drawn from N(1, 1). It is trained alongside the real
# features, and agg_permu_files uses its statistics as the reference that a
# real feature has to beat.
np.random.seed(SEED)
train['random'] = np.random.normal(loc=1, scale=1, size=train.shape[0])

In [13]:
# Work on a random subset of 300 features.
# NOTE(review): FEATURES is shuffled in place and then truncated, so running
# this cell a second time reshuffles the already reduced list.
np.random.seed(SEED)
np.random.shuffle(FEATURES)
FEATURES = FEATURES[:300]
print(FEATURES[:10])

['D_107_max', 'D_41_mean', 'D_77_min', 'R_19_mean', 'D_131_min', 'B_41_max', 'D_76_max', 'D_91_max', 'D_54_mean', 'D_84_max']


In [14]:
# Output directory for models and permutation-importance CSVs.
permu_dir = './permut_13m_ver001_retrain'
os.makedirs(permu_dir, exist_ok=True)
# WARNING: removes the files left in permu_dir by any previous run.
clear_file(permu_dir)

retrain_idx = 1
val_folds = [0,1,2,3,4,5,6,7,8,9]
shuffle_times = 10

while True:
    # Features
    # With sub_features_num equal to len(FEATURES) there is a single split
    # that contains every feature.
    sub_features_num = len(FEATURES)
    feature_split_num = max(round(len(FEATURES) / sub_features_num), 1) 
    sub_features_num = len(FEATURES) // feature_split_num
    print(f'FEATURES: {len(FEATURES)}; retrain_idx: {retrain_idx}; feature_split_num: {feature_split_num}; sub_features_num: {sub_features_num}')
    start_idx = 0
    for sub_features_idx in range(feature_split_num):
        # The last split absorbs the remainder of the integer division.
        if sub_features_idx == feature_split_num - 1:
            end_idx = len(FEATURES)
        else:
            end_idx = start_idx+sub_features_num
        sub_features = FEATURES[start_idx: end_idx]
        model_ver = f'13M_{retrain_idx}_{sub_features_idx}'
        # Train one model per fold on this split plus the noise feature.
        acc, oof, importances = train_xgb_params(params_13m, 
                                                 train,
                                                 features=sub_features+['random'],
                                                 val_folds=val_folds,
                                                 early_stopping_rounds=100, 
                                                 num_boost_round=9999,  
                                                 verbose_eval=False, 
                                                 data_seed=SEED, 
                                                 model_seed=SEED, 
                                                 save_dir=permu_dir,
                                                 ver=model_ver)
        # NOTE: the same file name is reused for every split and retrain round.
        importances.to_csv(os.path.join(permu_dir, 'original_importances.csv'))
        # Permutation Importance
        for fold in val_folds:
            print(f'........ sub_features_idx: {sub_features_idx}; start_idx: {start_idx}, end_idx: {end_idx}, fold: {fold};')
            # Saved as '{ver}_fold{fold}_{best_iter}.xgb'; the trailing '_'
            # keeps fold 1 from also matching fold 10, 11, ...
            model_prefix = f'{model_ver}_fold{fold}_'
            model_file = [c for c in os.listdir(permu_dir) if c.startswith(model_prefix)]
            if len(model_file) > 1:
                raise ValueError(f'There are more than one model file: {model_file}')
            if not model_file:
                raise FileNotFoundError(f'No model file starting with {model_prefix} in {permu_dir}')
            model_path = os.path.join(permu_dir, model_file[0])
            # PERMUTATION IMPORTANCE
            # Only features this fold's model actually split on are shuffled.
            importances_fold = importances[f'importance_{fold}']
            shuffle_features = importances_fold[importances_fold.notnull()].index.to_list()
            permutation_importance = get_permutation_importance(model_path,
                                                                train.loc[train['fold'] == fold, :],
                                                                model_features=sub_features+['random'], 
                                                                shuffle_features=shuffle_features,
                                                                shuffle_times=shuffle_times) 
            permutation_importance['retrain_idx'] = retrain_idx                                                   
            permutation_importance['sub_features_idx'] = sub_features_idx                                                    
            permutation_importance['fold'] = fold
            permutation_importance.to_csv(os.path.join(permu_dir, f'permutation_importance_{retrain_idx}_{sub_features_idx}_fold{fold}.csv'), index=False)        
        start_idx = end_idx
    # Keep only features measured in every fold and every shuffle.
    permutation_importance_agg = agg_permu_files(permu_dir, count_threshold=len(val_folds)*shuffle_times, retrain_idx=retrain_idx)
    # A feature is dropped when its z-score is below that of the noise feature.
    drop_features = permutation_importance_agg[permutation_importance_agg['z'] < permutation_importance_agg['random_z']].index.to_list()
    permutation_importance_agg['is_drop'] = permutation_importance_agg.index.isin(drop_features)
    permutation_importance_agg.to_csv(os.path.join(permu_dir, f'permutation_importance_agg_{retrain_idx}.csv'))
    FEATURES = [c for c in FEATURES if c not in drop_features]
    retrain_idx += 1
    # Only a single round is executed; the iterative stop rule is kept below.
    break
    # if len(drop_features) == 0:
    #     break

FEATURES: 300; retrain_idx: 1; feature_split_num: 1; sub_features_num: 300
........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 0;


100%|██████████| 290/290 [02:36<00:00,  1.85it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 1;


100%|██████████| 289/289 [02:40<00:00,  1.80it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 2;


100%|██████████| 285/285 [02:41<00:00,  1.77it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 3;


100%|██████████| 287/287 [02:44<00:00,  1.75it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 4;


100%|██████████| 288/288 [02:45<00:00,  1.74it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 5;


100%|██████████| 288/288 [02:36<00:00,  1.84it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 6;


100%|██████████| 287/287 [02:38<00:00,  1.81it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 7;


100%|██████████| 286/286 [02:37<00:00,  1.81it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 8;


100%|██████████| 289/289 [02:39<00:00,  1.82it/s]


........ sub_features_idx: 0; start_idx: 0, end_idx: 300, fold: 9;


100%|██████████| 289/289 [02:39<00:00,  1.82it/s]


# Baseline

In [16]:
# Baseline: cross-validated AUC using the current FEATURES list as-is
# (no model files are written because save_dir is False).
# NOTE(review): FEATURES is reassigned at the end of the permutation cell, so
# the number printed here depends on which cells ran before this one.
val_folds = [0,1,2,3,4,5,6,7,8,9]
acc, oof, importances = train_xgb_params(params_13m, 
                                     train,
                                     features=FEATURES,
                                     val_folds=val_folds,
                                     early_stopping_rounds=100, 
                                     num_boost_round=9999,  
                                     verbose_eval=False, 
                                     data_seed=SEED, 
                                     model_seed=SEED, 
                                     save_dir=False)
print('num_features', len(FEATURES), 'auc', acc) 

num_features 300 auc 0.9597397304150984


# 筛选变量实验

In [15]:
# Folds evaluated by the feature-selection experiments below (all ten CV folds).
val_folds = [0,1,2,3,4,5,6,7,8,9]

## 默认的Feature Importance

In [22]:
# Per-fold split-count importances written by the permutation cell.
original_importances = pd.read_csv(os.path.join(permu_dir, 'original_importances.csv'))
original_importances = original_importances.set_index('feature')
# Fix the list of per-fold columns before adding summary columns; otherwise
# the freshly added 'mean' would be counted as one more fold in the 'std'.
fold_columns = list(original_importances.columns)
original_importances['mean'] = original_importances[fold_columns].mean(axis=1)
original_importances['std'] = original_importances[fold_columns].std(axis=1)
original_importances = original_importances.reset_index()

In [33]:
# Experiment 1: rank features by their mean split-count importance across
# folds and retrain with the top 25, 50, 75, ... features.
# NOTE(review): original_importances comes from models trained with the
# synthetic 'random' column, so that column may appear in this ranking - verify.
features_mean = original_importances.sort_values('mean', ascending=False)['feature'].to_list()
num_features = 25
while True:
    acc, oof, importances = train_xgb_params(params_13m, 
                                         train,
                                         features=features_mean[:num_features],
                                         val_folds=val_folds,
                                         early_stopping_rounds=100, 
                                         num_boost_round=9999,  
                                         verbose_eval=False, 
                                         data_seed=SEED, 
                                         model_seed=SEED, 
                                         save_dir=False)
    print('num_features', num_features, 'auc', acc) 
    num_features += 25
    # NOTE(review): the stop bound uses permutation_importance_agg, which is
    # not defined in this cell and must already exist from another cell;
    # presumably chosen so all three experiments cover the same range.
    if num_features > permutation_importance_agg.shape[0]:
        break

num_features 25 auc 0.9519786413209202
num_features 50 auc 0.9551949305682249
num_features 75 auc 0.957583263622448
num_features 100 auc 0.9581601072672994
num_features 125 auc 0.958722652426215
num_features 150 auc 0.959060598689679
num_features 175 auc 0.9593140320399974
num_features 200 auc 0.9595746364492562
num_features 225 auc 0.9596106789864496
num_features 250 auc 0.959798236138387


## 平均下降比例

In [20]:
# Experiment 2: rank features by the mean relative AUC drop under permutation
# (column 'mean' of the aggregate written by the permutation cell) and
# retrain with the top 25, 50, 75, ... features.
permutation_importance_agg = pd.read_csv(os.path.join(permu_dir, 'permutation_importance_agg_1.csv'))
features_mean = permutation_importance_agg.sort_values('mean', ascending=False)['feature'].to_list()
num_features = 25
while True:
    acc, oof, importances = train_xgb_params(params_13m, 
                                         train,
                                         features=features_mean[:num_features],
                                         val_folds=val_folds,
                                         early_stopping_rounds=100, 
                                         num_boost_round=9999,  
                                         verbose_eval=False, 
                                         data_seed=SEED, 
                                         model_seed=SEED, 
                                         save_dir=False)
    print('num_features', num_features, 'auc', acc) 
    num_features += 25
    # Stop once the next step would exceed the number of ranked features.
    if num_features > permutation_importance_agg.shape[0]:
        break

num_features 25 auc 0.955488431492996
num_features 50 auc 0.9582052598376949
num_features 75 auc 0.9589016974455882
num_features 100 auc 0.9593989000375628
num_features 125 auc 0.9595675126936822
num_features 150 auc 0.9597684801181003
num_features 175 auc 0.9598145119359532
num_features 200 auc 0.9597407810213192
num_features 225 auc 0.9598664237091002
num_features 250 auc 0.959691966613858


## 标准化后的平均下降比例

In [21]:
# Experiment 3: rank features by the standardised drop 'z' (mean / std of the
# relative AUC drop under permutation) and retrain with the top
# 25, 50, 75, ... features.
permutation_importance_agg = pd.read_csv(os.path.join(permu_dir, 'permutation_importance_agg_1.csv'))
features_z = permutation_importance_agg.sort_values('z', ascending=False)['feature'].to_list()
num_features = 25
while True:
    acc, oof, importances = train_xgb_params(params_13m, 
                                         train,
                                         features=features_z[:num_features],
                                         val_folds=val_folds,
                                         early_stopping_rounds=100, 
                                         num_boost_round=9999,  
                                         verbose_eval=False, 
                                         data_seed=SEED, 
                                         model_seed=SEED, 
                                         save_dir=False)
    print('num_features', num_features, 'auc', acc) 
    num_features += 25
    # Stop once the next step would exceed the number of ranked features.
    if num_features > permutation_importance_agg.shape[0]:
        break

num_features 25 auc 0.9556982881629269
num_features 50 auc 0.9584120117990971
num_features 75 auc 0.9589902318791576
num_features 100 auc 0.9595072118772046
num_features 125 auc 0.9597150474489736
num_features 150 auc 0.9597403596967278
num_features 175 auc 0.9599117083347268
num_features 200 auc 0.9599500114626974
num_features 225 auc 0.9598069184316719
num_features 250 auc 0.959729845054988
