## **TabNet Training**

In [3]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle

from tqdm import tqdm
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
import copy
import warnings
warnings.simplefilter('ignore')

from sklearn.preprocessing import LabelEncoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### **Utils**

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their value range.

    Columns are reassigned on ``df`` itself, so the caller's frame is modified.
    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column (object, bool, unsigned ints, ...) is left as is.

    Note: float columns whose range fits are stored as float16, which keeps only
    about three significant decimal digits, so that downcast is lossy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When true, print the resulting memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:  # handle each column independently
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        # Pick the target dtype from the observed minimum and maximum.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a type's min/max still fits that type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or NaN/inf bounds: fall back to float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# All data paths are resolved relative to the directory two levels above the working directory.
base_dir = os.path.join("..", "..")
train_csv_path = os.path.join(base_dir, "input", "riiid-test-answer-prediction", "train.csv")
# The first CSV column is used as the index; low_memory=False is passed through to pandas.
train_df = pd.read_csv(train_csv_path, index_col=0, low_memory=False)

In [6]:
# Cast the explanation flag to float so it can be used as a numeric feature.
flag_col = "prior_question_had_explanation"
train_df[flag_col] = train_df[flag_col].astype("float")
# Discard every row containing a missing value, then shrink the numeric dtypes.
train_df = reduce_mem_usage(train_df.dropna())

Mem. usage decreased to 3111.84 Mb (58.8% reduction)


### **Training**

In [7]:
# Label column the classifier is trained to predict.
target = "answered_correctly"

# Input columns, in the order they are handed to the model.
# 'user_answer' and 'answered_correctly' (the label itself) are deliberately left out.
features = [
    "timestamp",
    "user_id",
    "content_id",
    "content_type_id",
    "task_container_id",
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
]

In [8]:
# Number of rows drawn for training; the value is also embedded in the saved model's file name.
sample_num = 10_000_000
# A fixed random_state keeps the drawn subset the same for the same input frame.
train_df = train_df.sample(random_state=0, n=sample_num)

In [9]:
import random

class CFG:
    """Central place for the run settings and TabNet hyper-parameters used in this notebook."""

    # ---- run / data settings ----
    SEED = 42
    START_IDX = 90000000
    TEST_SIZE = 0.2

    # ---- optimisation ----
    N_EPOCHS = 10
    PATIENCE = 3
    BATCH_SZ = 256
    VIRTUAL_BS = 128
    LR = 0.01
    MOMENTUM = 0.1
    CLIP = 1.0      # passed to the model as clip_value
    LAMBDA = 0      # passed to the model as lambda_sparse

    # ---- architecture ----
    # Width of the decision prediction layer. Larger values give the model more
    # capacity, at the risk of overfitting.
    ND = 8
    # Width of the attention embedding for each mask. According to the paper,
    # n_d = n_a is usually a good choice.
    NA = 8
    # Number of steps in the architecture (usually between 3 and 10).
    N_STEPS = 3
    # Coefficient for feature reuse in the masks. A value close to 1 makes mask
    # selection least correlated between layers. Values range from 1.0 to 2.0.
    GAMMA = 1.3
    # Number of independent Gated Linear Unit layers at each step (usually 1 to 5).
    N_INDEPENDENT = 1
    # Number of shared Gated Linear Units at each step (usually 1 to 5).
    N_SHARED = 3
    # Masking function used for selecting features: either "sparsemax" (default) or "entmax".
    MASK_TYPE = 'sparsemax'

def seed_everything(seed_value):
    """Seed the random number generators used in this notebook for reproducible runs.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available), exports
    ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Parameters
    ----------
    seed_value : int
        Seed applied to every generator.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # NOTE: hash randomisation of the running interpreter is fixed at start-up;
    # this export only reaches processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # Reproducibility needs deterministic cuDNN kernels and no auto-tuner;
    # both flags are plain settings and harmless on CPU-only machines.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
# Apply the configured seed (CFG.SEED) to every generator handled by seed_everything.
seed_everything(CFG.SEED)

In [10]:
# Columns treated as categorical by TabNet; none are declared at the moment.
categorical_columns = []

# Position of each categorical column inside the feature matrix ...
cat_idxs = [train_df[features].columns.get_loc(name) for name in categorical_columns]
# ... and the number of distinct values each of them takes.
cat_dims = [(train_df[features])[name].nunique() for name in categorical_columns]

print(cat_idxs, cat_dims)

[] []


In [21]:
# training TabNet
# Constructor arguments are collected first so the configuration can be inspected before use.
tabnet_params = dict(
    # architecture
    n_d=CFG.ND,
    n_a=CFG.NA,
    n_steps=CFG.N_STEPS,
    gamma=CFG.GAMMA,
    n_independent=CFG.N_INDEPENDENT,
    n_shared=CFG.N_SHARED,
    mask_type=CFG.MASK_TYPE,
    # categorical handling (both lists are empty unless categorical columns are declared)
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    # optimisation
    optimizer_params=dict(lr=CFG.LR),
    momentum=CFG.MOMENTUM,
    lambda_sparse=CFG.LAMBDA,
    clip_value=CFG.CLIP,
    # Learning-rate schedule, currently disabled:
    #   scheduler_params=dict(milestones=[20, 50, 80], gamma=0.5),
    #   scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
    # runtime
    verbose=1,
    device_name="cuda",
)
model = TabNetClassifier(**tabnet_params)

# Training-loop settings, all taken from CFG except the loader options.
# No eval_set is supplied; the logged output reports that no early stopping is
# performed, so `patience` presumably has no effect here — TODO confirm.
# weights=1 presumably enables the library's class-balanced sampling — TODO confirm.
fit_params = dict(
    max_epochs=CFG.N_EPOCHS,
    patience=CFG.PATIENCE,
    batch_size=CFG.BATCH_SZ,
    virtual_batch_size=CFG.VIRTUAL_BS,
    weights=1,
    num_workers=0,
    drop_last=False,
)

model.fit(
    X_train=train_df[features].values,
    y_train=train_df[target].values,
    **fit_params,
)


# save model
# Output directory for trained model artefacts. Named `model_dir` rather than `dir`
# so the builtin dir() stays usable in the kernel.
model_dir = os.path.join(base_dir, "input", "riiid-test-answer-prediction", "riiid-featurev1-tabnet")
# exist_ok=True replaces the separate exists() check and is safe to re-run.
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "featureV1-TabNet_trained_%dsamples%depoch_gpu" % (sample_num, CFG.N_EPOCHS))

# save_model returns the path of the archive it wrote (a .zip file, per the logged output).
saved_path = model.save_model(model_path)

# os.replace overwrites an existing target on every platform, whereas os.rename
# raises FileExistsError on Windows when this cell is run a second time.
os.replace(saved_path, model_path + ".mdl")

Device used : cuda
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 0.69793 |  0:00:22s
epoch 1  | loss: 0.68633 |  0:00:44s
epoch 2  | loss: 0.68576 |  0:01:06s
epoch 3  | loss: 0.68511 |  0:01:28s
epoch 4  | loss: 0.6839  |  0:01:50s
epoch 5  | loss: 0.6844  |  0:02:12s
epoch 6  | loss: 0.68346 |  0:02:34s
epoch 7  | loss: 0.68395 |  0:02:56s
epoch 8  | loss: 0.68294 |  0:03:18s
epoch 9  | loss: 0.68287 |  0:03:40s
Successfully saved model at ..\..\input\riiid-test-answer-prediction\riiid-featurev1-tabnet\featureV1-TabNet_trained_1000000samples10epoch_gpu.zip


In [20]:
# NOTE(review): `test_df` is not defined in any cell visible in this notebook, so this
# cell fails on a fresh "Restart & Run All" — create or load a hold-out frame first.
# Predict class labels for the hold-out rows using the same feature columns as training.
model.predict(test_df[features].values)

array([0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int8)

In [23]:
# Show the fitted model's `preds_mapper`; presumably the mapping from output index to
# class label — TODO confirm against the pytorch-tabnet documentation.
model.preds_mapper

{0: 0, 1: 1}