In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import os
import pyarrow.feather as feather
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import torch
import random
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric

import gc
import warnings
#warnings.filterwarnings("ignore")

In [2]:
# The make_data step is skipped: the pre-built feature table is read straight
# from disk (presumably produced by make_data earlier -- confirm).
data_use = pd.read_feather("../Kaggle/AMEX/train_use.ftr")
train_labels = pd.read_csv("../Kaggle/AMEX/train_labels.csv")

# Show (rows, columns) of the feature frame, then of the label frame.
print(data_use.shape, train_labels.shape, sep="\n")

def categorization(data):
    """Label-encode every non-numeric column of ``data`` in place.

    Each column that pandas does not report as numeric is fitted with its own
    ``LabelEncoder`` and overwritten by the resulting integer codes, stored
    with ``category`` dtype.  Numeric columns are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    num_cols = set(data._get_numeric_data().columns)
    # Walk the frame's own column order rather than a set, so the columns are
    # processed in the same order on every run.
    cat_cols = [column for column in data.columns if column not in num_cols]

    for column in cat_cols:
        le = preprocessing.LabelEncoder()
        label_encoded_column = le.fit_transform(data[column])
        # Attach the frame's index explicitly: a bare pd.Series gets a fresh
        # 0..n-1 index and column assignment aligns on labels, which would
        # turn the codes into NaN for a frame without a default RangeIndex.
        data[column] = pd.Series(label_encoded_column, index=data.index).astype('category')

    return data

# Encode the non-numeric columns; categorization() edits the frame in place
# and hands the same object back, so the name is simply rebound to it.
data_use = categorization(data_use)

(458913, 2033)
(458913, 2)


In [3]:
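# Optional smoke test: uncomment BOTH lines to work on the first 1000 rows / 40 columns.
# They must be toggled together so that features and labels keep the same rows.
#data_use=data_use.iloc[0:1000,0:40]
#train_labels=train_labels.iloc[0:1000,:]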


In [3]:
class CFG:
    """Run-wide settings read by the other cells of this notebook."""

    # Experiment identity and reproducibility.
    model = 'tabnet'
    seed = 42

    # Cross-validation layout.
    N_folds = 5
    DEBUG = False  # when True, the training loop stops after the first fold

    # Optimisation budget.
    max_epochs = 60
    batch_size = 512

In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    seed : int
        Seed applied to Python's ``random`` module, NumPy's legacy global
        generator and PyTorch's default generator.
    """
    random.seed(seed)
    # Only affects processes started afterwards: the running interpreter reads
    # PYTHONHASHSEED once at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # torch draws from its own generator; without this call any randomness
    # produced through torch is not covered by the seed.
    torch.manual_seed(seed)

# Fix the random state once, using the seed configured in CFG.
seed_everything(seed=CFG.seed)

In [5]:
def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean of the normalized weighted Gini and the 4% capture rate.

    Rows are ranked by descending prediction.  Every negative (``0``) carries
    weight 20 and every positive (``1``) weight 1.  The score is
    ``0.5 * (g + d)`` where ``d`` is the share of positives found in the
    top 4% of cumulative weight and ``g`` is the weighted Gini divided by
    its maximum attainable value.

    Parameters
    ----------
    y_true : array-like
        Binary targets (0 or 1).  Arrays, lists and pandas Series are accepted.
    y_pred : array-like
        Predicted scores, one per target; higher means more likely positive.

    Returns
    -------
    float
        The combined metric.  Inputs without any positive or without any
        negative are not guarded against.
    """
    # Work on plain 1-D arrays so that ``[indices]`` below is always
    # positional; on a pandas Series it would be a label lookup, which only
    # matches positions for a default 0..n-1 index.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [6]:
class Amex_tabnet(Metric):
    """Metric object that reports ``amex_metric_numpy`` as 'amex_tabnet'."""

    def __init__(self):
        # Higher is better for this score.
        self._maximize = True
        self._name = 'amex_tabnet'

    def __call__(self, y_true, y_pred):
        """Score column 1 of ``y_pred`` against ``y_true``, floored at 0."""
        second_column = y_pred[:, 1]
        return max(amex_metric_numpy(y_true, second_column), 0.)

In [7]:
#TabNet
# Stratified K-fold training of TabNet. For every fold: fit, save the model,
# collect explanation masks, store out-of-fold (OOF) predictions, feature
# importances and the per-epoch history. The OOF score is printed at the end.

# Create out of folds array
oof_predictions = np.zeros((data_use.shape[0]))
# Marks rows that actually received an OOF prediction; some stay False when
# DEBUG stops the loop after the first fold.
oof_filled = np.zeros(data_use.shape[0], dtype=bool)
#test_predictions = np.zeros(test.shape[0])
feature_importances = pd.DataFrame()
feature_importances["feature"] = data_use.columns.tolist()
stats = pd.DataFrame()
stats_by_column = {}
explain_matrices = []
masks_ = []

kfold = StratifiedKFold(n_splits = CFG.N_folds, shuffle=True, random_state = CFG.seed)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(data_use, train_labels["target"])):

    ## DEBUG MODE
    if CFG.DEBUG and fold > 0:
        print('\nDEBUG mode activated: Will train only one fold...\n')
        break

    X_train, y_train = data_use.iloc[train_idx,:], train_labels["target"].iloc[train_idx]
    X_valid, y_valid = data_use.iloc[valid_idx,:], train_labels["target"].iloc[valid_idx]

    # Convert the validation frame once; it is reused for the eval set, the
    # explanation pass and the OOF inference below.
    X_valid_np = np.array(X_valid)
    y_valid_np = np.array(y_valid.values.ravel())

    model = TabNetClassifier(n_d = 32,
                             n_a = 32,
                             n_steps = 3,
                             gamma = 1.3,
                             n_independent = 2,
                             n_shared = 2,
                             momentum = 0.02,
                             clip_value = None,
                             lambda_sparse = 1e-3,
                             optimizer_fn = torch.optim.Adam,
                             optimizer_params = dict(lr = 1e-3, weight_decay=1e-3),
                             scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
                             scheduler_params = {'T_0':5,
                                                 'eta_min':1e-4,
                                                 'T_mult':1,
                                                 'last_epoch':-1},
                             mask_type = 'entmax',
                             seed = CFG.seed)

    ## train
    model.fit(np.array(X_train),
              np.array(y_train.values.ravel()),
              eval_set = [(X_valid_np, y_valid_np)],
              max_epochs = CFG.max_epochs,
              patience = 50,
              batch_size = CFG.batch_size,
              eval_metric = ['auc', 'accuracy', Amex_tabnet]) # Last metric is used for early stopping

    # Saving best model
    saving_path_name = f"./fold{fold}"
    saved_filepath = model.save_model(saving_path_name)

    # model explainability
    explain_matrix, masks = model.explain(X_valid_np)
    explain_matrices.append(explain_matrix)
    # NOTE(review): only entries 0 and 1 of `masks` are kept although the
    # model is built with n_steps = 3 -- confirm that dropping the rest is intended.
    masks_.append(masks[0])
    masks_.append(masks[1])

    # Inference
    oof_predictions[valid_idx] = model.predict_proba(X_valid_np)[:, 1]
    oof_filled[valid_idx] = True

    #if CFG
    # logodds function

    #test_predictions += model.predict_proba(test.values)[:, 1]/5
    # The "+1" belongs inside the braces so the column is numbered like the
    # stats columns below (fold 0 -> "importance_fold1").
    feature_importances[f"importance_fold{fold+1}"] = model.feature_importances_

    # Loss , metric tracking
    # Folds can stop after different numbers of epochs; wrapping each history
    # in a Series lets shorter folds be padded with NaN instead of the column
    # assignment failing on a length mismatch.
    stats_by_column[f'fold{fold+1}_train_loss'] = pd.Series(model.history['loss'])
    stats_by_column[f'fold{fold+1}_val_metric'] = pd.Series(model.history['val_0_amex_tabnet'])
    stats = pd.DataFrame(stats_by_column)

    print(f'\nFold {fold+1}/{CFG.N_folds}' )

    ### free memory
    del X_train, y_train
    del X_valid, y_valid
    del X_valid_np, y_valid_np
    gc.collect()

# Score only rows that were predicted, as a plain array so the metric indexes
# positionally. When every fold ran this is the whole training set.
oof_targets = train_labels["target"].to_numpy()
oof_score = amex_metric_numpy(oof_targets[oof_filled], oof_predictions[oof_filled])
print(f'OOF score across folds: {oof_score}')

Device used : cpu
epoch 0  | loss: 0.55506 | val_0_auc: 0.91432 | val_0_accuracy: 0.84036 | val_0_amex_tabnet: 0.62302 |  0:13:06s
epoch 1  | loss: 0.37439 | val_0_auc: 0.928   | val_0_accuracy: 0.86273 | val_0_amex_tabnet: 0.66821 |  0:25:57s
epoch 2  | loss: 0.33387 | val_0_auc: 0.93325 | val_0_accuracy: 0.86728 | val_0_amex_tabnet: 0.69043 |  0:38:48s
epoch 3  | loss: 0.31245 | val_0_auc: 0.93784 | val_0_accuracy: 0.87437 | val_0_amex_tabnet: 0.70421 |  0:51:44s
epoch 4  | loss: 0.30035 | val_0_auc: 0.94011 | val_0_accuracy: 0.87715 | val_0_amex_tabnet: 0.71328 |  1:04:49s
epoch 5  | loss: 0.28304 | val_0_auc: 0.94783 | val_0_accuracy: 0.88647 | val_0_amex_tabnet: 0.74411 |  1:17:39s
epoch 6  | loss: 0.26351 | val_0_auc: 0.95032 | val_0_accuracy: 0.88977 | val_0_amex_tabnet: 0.75083 |  1:30:18s
epoch 7  | loss: 0.25625 | val_0_auc: 0.95238 | val_0_accuracy: 0.89125 | val_0_amex_tabnet: 0.75887 |  1:43:25s
epoch 8  | loss: 0.25137 | val_0_auc: 0.95313 | val_0_accuracy: 0.89336 | val_

KeyboardInterrupt: 