In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.model_selection import train_test_split

import optuna
from sklearn.model_selection import KFold

In [None]:
# Read the train and test tables from the shared data directory
# (presumably outputs of an earlier EDA step, judging by the file names).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/minseok_EDA2_train.csv", "../data/minseok_EDA2_test.csv")
)

In [None]:
# Forward-fill missing values: every NaN takes the value from the previous row
# of the same column (NaNs in the leading rows remain, as there is nothing
# above them to copy from).
# `fillna(method='ffill')` is deprecated in recent pandas releases; `.ffill()`
# is the direct equivalent. Rebinding instead of `inplace=True` gives the same
# result without mutating the frame in place.
train = train.ffill()
test = test.ffill()

In [None]:
# Select the 16 columns at positions 39..54 of each frame
# (presumably the VCL questionnaire items -- TODO confirm against the CSV header).
# `.copy()` makes these independent frames, so adding a column to them later
# does not raise SettingWithCopyWarning.
vcl = train[train.columns[39:55]].copy()
vcl_test = test[test.columns[39:55]].copy()

In [None]:
# Add `VCL_cal`: the row-wise sum of the selected columns divided by 16.
# Any existing `VCL_cal` column is excluded from the sum so that re-running
# this cell gives the same result instead of folding the previous value in.
# `assign` returns a new frame, which avoids setting a column on a slice.
vcl = vcl.assign(
    VCL_cal=vcl.drop(columns=['VCL_cal'], errors='ignore').sum(axis=1) / 16
)
vcl_test = vcl_test.assign(
    VCL_cal=vcl_test.drop(columns=['VCL_cal'], errors='ignore').sum(axis=1) / 16
)

In [None]:
# Append the derived `VCL_cal` feature as the last column of each frame.
# Any `VCL_cal` already present is dropped first, so re-running this cell
# cannot leave duplicate `VCL_cal` columns that would be fed to the model twice.
train = pd.concat(
    [train.drop(columns=['VCL_cal'], errors='ignore'), vcl['VCL_cal']],
    axis=1,
)
test = pd.concat(
    [test.drop(columns=['VCL_cal'], errors='ignore'), vcl_test['VCL_cal']],
    axis=1,
)

In [None]:
# Feature column names: every column of `train` except the 'nerdiness' target.
columns = [name for name in train.columns if name != 'nerdiness']

In [None]:
# DataFrame/Series views of the features and the target, kept alongside the
# NumPy arrays built in a later cell.
data = train.loc[:, columns]
target = train.loc[:, 'nerdiness']

In [None]:
# NumPy inputs for TabNet: feature matrix (all columns but the target) and
# the target vector. `y` is left one-dimensional; it is not reshaped into a
# column vector.
X = train.drop(columns=['nerdiness']).values
y = train.loc[:, 'nerdiness'].values

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [None]:
# X_train = X_train.to_numpy()
# y_train = y_train.to_numpy().squeeze()
# X_test = X_test.to_numpy()
# y_test = y_test.to_numpy().squeeze()

In [None]:
# clf = TabNetClassifier(
#                        cat_emb_dim=5,
#                        optimizer_fn=torch.optim.Adam,
#                        optimizer_params=dict(lr=1e-2),
#                        scheduler_params={"step_size":30,
#                                          "gamma":0.9},
#                        scheduler_fn=torch.optim.lr_scheduler.StepLR,
#                        mask_type='sparsemax' # "sparsemax", entmax
#                       )

In [None]:
# max_epochs = 50

# clf.fit(
#     X_train=X_train, y_train=y_train,
#     eval_set=[(X_train, y_train), (X_test, y_test)],
#     eval_name=['train', 'valid'],
#     eval_metric=['auc'],
#     max_epochs=max_epochs , patience=20,
#     batch_size=2048, virtual_batch_size=256,
#     num_workers=0,
#     weights=1,
#     drop_last=True,
# )

In [None]:
def Objective(trial):
    """Optuna objective: mean validation score of TabNet over a 3-fold CV.

    Samples one TabNet configuration from ``trial``, trains a fresh
    ``TabNetClassifier`` on each of three shuffled folds of the module-level
    arrays ``X`` and ``y``, and averages ``clf.best_cost`` across the folds.
    With ``eval_metric=['auc']`` and a single eval set, ``best_cost`` is the
    best validation AUC reached during training (per pytorch-tabnet's early
    stopping), so the study should be created with ``direction="maximize"``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters. The parameter names registered
        here ("mask_type", "n_da", "n_steps", "gamma", "n_shared",
        "lambda_sparse", "patienceScheduler", "patience", "epochs") are the
        keys later found in ``study.best_params``.

    Returns
    -------
    float
        Mean of the per-fold ``best_cost`` values.
    """
    # --- architecture / regularisation search space -------------------------
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)  # shared value for n_d and n_a
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)

    # --- training-schedule search space -------------------------------------
    # The scheduler patience range (3-10) is kept below the early-stopping
    # patience range (15-30) so the learning rate can be reduced before
    # training is stopped.
    scheduler_patience = trial.suggest_int("patienceScheduler", low=3, high=10)
    # Sampled once per trial rather than inside the fold loop: Optuna returns
    # the same value for a repeated name anyway, so this only removes the
    # redundant calls.
    early_stopping_patience = trial.suggest_int("patience", low=15, high=30)
    max_epochs = trial.suggest_int('epochs', 1, 100)

    tabnet_params = dict(
        n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
        lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type=mask_type, n_shared=n_shared,
        scheduler_params=dict(
            # pytorch-tabnet steps ReduceLROnPlateau with the early-stopping
            # metric, which here is the validation AUC (higher is better).
            # The scheduler therefore has to run in "max" mode; with "min" it
            # would cut the learning rate while the AUC was still improving.
            mode="max",
            patience=scheduler_patience,
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
    )

    kf = KFold(n_splits=3, random_state=42, shuffle=True)
    fold_scores = []
    for train_index, valid_index in kf.split(X):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        # A new model per fold so no weights leak between folds.
        clf = TabNetClassifier(**tabnet_params)
        clf.fit(X_train=X_train, y_train=y_train,
                eval_set=[(X_valid, y_valid)],
                patience=early_stopping_patience,  # early stopping
                max_epochs=max_epochs,
                eval_metric=['auc'])
        fold_scores.append(clf.best_cost)
    return np.mean(fold_scores)

In [None]:
# Hyperparameter search: 20 trials, maximising the value returned by
# `Objective`. The sampler is seeded so a Restart & Run All explores the same
# sequence of configurations; TPESampler is the sampler Optuna uses by default
# for single-objective studies, so only the seed is new.
study = optuna.create_study(
    direction="maximize",
    study_name='TabNet optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(Objective, n_trials=20)  # timeout=6*60

In [None]:
# Hyperparameters of the best trial found by the study, keyed by the names
# registered through `trial.suggest_*`.
TabNet_params = study.best_trial.params

In [None]:
# Show the selected hyperparameters so they are recorded in the notebook output.
print(TabNet_params)

In [None]:
# Constructor arguments for the final model, rebuilt from the best trial.
# The fixed settings (Adam, lr, weight decay, scheduler type) repeat the
# values used inside `Objective`.
# NOTE(review): the scheduler runs in mode "min" although the metric requested
# at fit time is AUC -- confirm which quantity the scheduler is stepped on.
final_params = {
    "n_d": TabNet_params['n_da'],
    "n_a": TabNet_params['n_da'],
    "n_steps": TabNet_params['n_steps'],
    "gamma": TabNet_params['gamma'],
    "lambda_sparse": TabNet_params['lambda_sparse'],
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    "mask_type": TabNet_params['mask_type'],
    "n_shared": TabNet_params['n_shared'],
    "scheduler_params": {
        "mode": "min",
        "patience": TabNet_params['patienceScheduler'],
        "min_lr": 1e-5,
        "factor": 0.5,
    },
    "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
    "verbose": 0,
}
# Number of training epochs selected by the search.
epochs = TabNet_params['epochs']

In [None]:
# Train the final model on the full training set with the tuned settings.
# NOTE(review): no eval_set is passed here, so `patience` / `eval_metric`
# presumably have nothing to monitor -- confirm against pytorch-tabnet.
clf = TabNetClassifier(**final_params)
clf.fit(
    X,
    y,
    eval_metric=['auc'],
    max_epochs=epochs,
    patience=TabNet_params['patience'],
)

In [None]:
# Build the test matrix from the same feature list, in the same order, as the
# training matrix (`columns` = every column of `train` except 'nerdiness').
# Selecting by name instead of taking `test.values` means a test CSV whose
# columns are ordered differently, or that is missing a feature, is either
# aligned correctly or fails loudly with a KeyError rather than silently
# feeding misaligned features to the model.
X_test = test[columns].values
# Submission template to be filled with predictions.
sub = pd.read_csv("../data/sample_submission.csv")

In [None]:
# Fill the submission template with the model's predictions and write it out.
# NOTE(review): `predict` returns hard class labels; the model was tuned on
# AUC, so if the submission is scored on AUC too, class probabilities from
# `clf.predict_proba(X_test)[:, 1]` would be the better output -- confirm the
# expected submission format before changing.
sub['nerdiness'] = clf.predict(X_test)
# Create the output directory if it does not exist yet, so a fresh checkout
# does not fail here after the whole training run has completed.
os.makedirs('../submission', exist_ok=True)
sub.to_csv('../submission/tabnet.csv', index=False)