In [1]:
!pip install pytorch-tabnet==3.1.1

Collecting pytorch-tabnet==3.1.1
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [2]:
import numpy as np
import pandas as pd
import torch

from torch import nn
from pytorch_tabnet.tab_model  import TabNetClassifier 
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

In [11]:
# Read the labelled training table and the unlabelled test table from the dataset folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './drive/MyDrive/dacon/jobcare_211207/dataset/train.csv',
        './drive/MyDrive/dacon/jobcare_211207/dataset/test.csv',
    )
)

In [12]:
# Time-based hold-out split on the calendar month of `contents_open_dt`.
# The column is parsed once, vectorized, instead of calling pd.Timestamp per row twice.
# NOTE(review): rows whose month is 12 end up in neither split -- confirm the data
# contains no December records.
open_month = pd.to_datetime(train_df['contents_open_dt']).dt.month

train = train_df[open_month < 11].copy()   # months before November -> training rows
val = train_df[open_month == 11].copy()    # November -> validation rows
test = test_df.copy()                      # work on a copy so test_df stays untouched

# Simple preprocessing

In [13]:
# Columns removed from all three frames before modelling
# (the original list named 'contents_open_dt' twice; it is listed once here).
drop_cols = [
    'contents_open_dt', 'contents_rn', 'id', 'person_rn',
    'person_prefer_f', 'person_prefer_g',
]

# Non-mutating drop. errors='ignore' lets this cell be re-run after the columns are
# already gone instead of raising KeyError.
train = train.drop(columns=drop_cols, errors='ignore')
val = val.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

# Use one fixed (alphabetical) feature order for every frame; 'target' goes last
# in the labelled frames.
columns = sorted(test.columns)

# Multiplying by 1 turns boolean columns into 0/1 integers and leaves numeric
# columns unchanged.
train = train[columns + ['target']] * 1
val = val[columns + ['target']] * 1
test = test[columns] * 1

In [14]:
# Integer-encode every feature column whose name does not contain 'match', and record
# for TabNet the column position (cat_idxs) and the number of codes (cat_dims).
cat_idxs, cat_dims = [], []

for col_pos, col_name in enumerate(train.columns):
    # The label column and the 'match' columns are left as they are.
    if col_name == 'target' or 'match' in col_name:
        continue

    # The vocabulary comes from the full labelled frame (train_df), not only the
    # training split.
    encoder = LabelEncoder()
    encoder.fit(train_df[col_name].values)
    code_of = {
        label: code
        for label, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
    }

    # Values missing from the vocabulary share one extra code placed after the known ones.
    for frame in (train, val, test):
        frame[col_name] = frame[col_name].apply(
            lambda value: code_of.get(value, len(code_of))
        )

    cat_idxs.append(col_pos)
    cat_dims.append(len(code_of) + 1)  # known codes + the extra "unseen" code

In [15]:
# Convert the frames into NumPy feature matrices and label vectors.
y_train, y_val = train['target'].values, val['target'].values
X_train = train.drop(columns='target').values
X_val = val.drop(columns='target').values
X_test = test.values  # used as-is; no label column is removed here

# Validation pair kept under its own name.
eval_set = (X_val, y_val)

In [16]:
# Hyper-parameters gathered in one mapping so they can be read and tuned in one place.
tabnet_kwargs = {
    'cat_idxs': cat_idxs,                  # positions of the encoded categorical columns
    'cat_dims': cat_dims,                  # number of codes per categorical column
    'cat_emb_dim': 3,
    'optimizer_fn': torch.optim.AdamW,     # any optimizer works here
    'mask_type': 'entmax',                 # alternative: "sparsemax"
}
clf = TabNetClassifier(**tabnet_kwargs)

Device used : cuda


In [17]:
class F1_Score(Metric):
    """Custom TabNet evaluation metric named "f1" that is to be maximized.

    The score is sklearn's ``f1_score`` computed on hard labels obtained by
    thresholding column 1 of ``y_score`` at 0.5 (presumably the positive-class
    probability -- confirm against the library's metric contract).
    """

    def __init__(self):
        # Higher is better; "f1" is the name used to refer to this metric.
        self._maximize = True
        self._name = "f1"

    def __call__(self, y_true, y_score):
        """Return the F1 score of the thresholded predictions against ``y_true``."""
        second_column = y_score[:, 1]
        hard_labels = (second_column > 0.5) * 1  # boolean mask -> 0/1 integers
        return f1_score(y_true, hard_labels)

In [18]:
# Evaluation sets keyed by the name shown in the training log (insertion order is kept).
monitored_sets = {
    'train': (X_train, y_train),
    'val': (X_val, y_val),
}

# Train with both log-loss and the custom "f1" metric reported for every monitored set.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=list(monitored_sets.values()),
    eval_name=list(monitored_sets.keys()),
    eval_metric=['logloss', 'f1'],
    max_epochs=100,
    patience=2,
    batch_size=1024,
    virtual_batch_size=256,
    num_workers=1,
    drop_last=False,
)

epoch 0  | loss: 0.68259 | train_logloss: 0.67297 | train_f1: 0.59623 | val_logloss: 0.67539 | val_f1: 0.59006 |  0:00:23s
epoch 1  | loss: 0.66988 | train_logloss: 0.66206 | train_f1: 0.64045 | val_logloss: 0.66967 | val_f1: 0.62884 |  0:00:46s
epoch 2  | loss: 0.65873 | train_logloss: 0.65048 | train_f1: 0.65304 | val_logloss: 0.65892 | val_f1: 0.63797 |  0:01:09s
epoch 3  | loss: 0.6505  | train_logloss: 0.64358 | train_f1: 0.66452 | val_logloss: 0.65671 | val_f1: 0.64603 |  0:01:32s
epoch 4  | loss: 0.64531 | train_logloss: 0.63756 | train_f1: 0.63213 | val_logloss: 0.65294 | val_f1: 0.61249 |  0:01:55s
epoch 5  | loss: 0.64118 | train_logloss: 0.63262 | train_f1: 0.65644 | val_logloss: 0.65207 | val_f1: 0.63283 |  0:02:18s

Early stopping occurred at epoch 5 with best_epoch = 3 and best_val_f1 = 0.64603
Best weights from best epoch are automatically used!


In [19]:
# Hard 0/1 test predictions: threshold column 1 of the predicted probabilities at 0.5.
preds = (clf.predict_proba(X_test)[:, 1] > 0.5) * 1

In [20]:
# Load the submission template and set its 'target' column to the predicted labels.
submission = pd.read_csv(
    './drive/MyDrive/dacon/jobcare_211207/dataset/sample_submission.csv'
).assign(target=preds)

In [21]:
# Display the last rows of the submission frame as a quick visual check.
submission.tail()

Unnamed: 0,id,target
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1
46403,46403,1


In [23]:
# Write the submission file without the DataFrame index column.
submission.to_csv(
    path_or_buf='./drive/MyDrive/dacon/jobcare_211207/mysubmission.csv',
    index=False,
)