In [None]:
pip install pytorch-tabnet

In [None]:
# Preliminaries
import gc
import os
import pickle
import random

import numpy as np
import pandas as pd

# Torch and Tabnet
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.tab_model import TabNetRegressor

# Sklearn for splitting, label encoding and imputation
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Notebook display helpers
from IPython.display import FileLink

<h1> Configuration </h1>

In [None]:
# Number of cross-validation folds the training rows are split into.
NUM_FOLDS = 7
# Seed reused for shuffling the training rows, seeding the RNGs and the TabNet model.
seed = 2020

In [None]:
seed = 2020


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    seed : int
        Value applied to Python's ``random`` module, the ``PYTHONHASHSEED``
        environment variable, NumPy and PyTorch (CPU and every CUDA device).
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the remaining GPUs on multi-device machines; this call is a
    # no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can pick different kernels from run to run, so it
    # is switched off together with enabling deterministic mode.
    torch.backends.cudnn.benchmark = False


seed_everything(seed)

<h1> Data Preparation </h1>

In [None]:
# Directory holding the competition CSV files.
folder_path = '../input/ieee-fraud-detection/'

# Read the five input tables in a fixed order and bind each to its own name.
train_identity, train_transaction, test_identity, test_transaction, sub = (
    pd.read_csv(folder_path + csv_name)
    for csv_name in (
        'train_identity.csv',
        'train_transaction.csv',
        'test_identity.csv',
        'test_transaction.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Left-join the identity table onto the transactions so that every
# transaction row is kept, with or without identity information.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

In [None]:
# The merged frames carry all the data from here on; drop the four source
# tables and ask the garbage collector to reclaim their memory.
del train_identity, train_transaction, test_identity, test_transaction
gc.collect()

In [None]:
# Shuffle the rows once (reproducibly) so the contiguous blocks produced by
# the unshuffled KFold below are random subsets of the data.
train = train.sample(frac=1, random_state=seed).reset_index(drop=True)

# -1 marks "no fold yet"; every row is overwritten in the loop below.
train["kfold"] = -1

# Creating folds: each row is tagged with the fold in which it is validation data.
kf = KFold(n_splits=NUM_FOLDS)
for fold, (train_rows, valid_rows) in enumerate(kf.split(train)):
    train.loc[valid_rows, "kfold"] = fold

In [None]:
# The 'id' columns of the test set use a different name format from the
# training set; rewrite their prefix to 'id_' so that both frames agree.
def _normalise_id_column(name):
    """Return ``name`` with an ``id<separator>`` prefix rewritten to ``id_``.

    Only names that start with ``id`` followed by a non-alphanumeric
    separator are changed, so unrelated columns that merely begin with
    those two letters (and names shorter than three characters) are
    returned untouched.
    """
    has_separator = len(name) > 2 and not name[2].isalnum()
    if name.startswith('id') and has_separator:
        return 'id_' + name[3:]
    return name


test.columns = [_normalise_id_column(name) for name in test.columns]

In [None]:
# Remove the transaction identifier and timestamp columns from both frames.
train = train.drop(columns=['TransactionDT', 'TransactionID'])
test = test.drop(columns=['TransactionDT', 'TransactionID'])

In [None]:
# Sanity check: list, in training-column order, every column absent from the test frame.
test_column_names = set(test.columns)
train_only_columns = [col for col in train.columns if col not in test_column_names]
for col in train_only_columns:
    print(col)

In [None]:
# Names of the columns to be treated as categorical. Names that are not
# present in the training frame are simply never reached by the loop below.
categorical_columns = (
    [f'id_{num}' for num in range(12, 39)]
    + ['DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4']
    + ['P_emaildomain', 'R_emaildomain']
    + ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
    + ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']
    + [f'{side}_emaildomain_{part}' for side in 'PR' for part in (1, 2, 3)]
)

# Maps each encoded column to the number of distinct labels seen in train + test.
categorical_dims = {}
for col in train.columns:
    # The target and the fold marker are never encoded.
    if col in ('isFraud', 'kfold') or col not in categorical_columns:
        continue

    # Values are compared as strings, so missing values become their own label.
    train_labels = list(train[col].astype(str).values)
    test_labels = list(test[col].astype(str).values)

    # Fit on both frames so that every label found in the test set is known.
    encoder = LabelEncoder().fit(train_labels + test_labels)
    train[col] = encoder.transform(train_labels)
    test[col] = encoder.transform(test_labels)
    categorical_dims[col] = len(encoder.classes_)

In [None]:
# Columns that are neither model inputs nor the target.
unused_feat = ['kfold']
target_features = 'isFraud'

# Model inputs, in training-column order.
features = [
    name for name in train.columns
    if name != target_features and name not in unused_feat
]

# Positions (within `features`) and cardinalities of the categorical inputs,
# collected in a single pass so the two lists stay aligned.
cat_idxs, cat_dims = [], []
for position, name in enumerate(features):
    if name in categorical_columns:
        cat_idxs.append(position)
        cat_dims.append(categorical_dims[name])

In [None]:
# Save the prepared frames and feature metadata in one file; the order of the
# list is the order in which the Model section unpacks it.
prepared = [train, test, unused_feat, target_features, features, cat_idxs, cat_dims]
with open('train_test.pkl', 'wb') as file:
    pickle.dump(prepared, file)

In [None]:
# Render a download link for the prepared-data file written above.
FileLink('train_test.pkl')

<h1> Model </h1>

In [None]:
# Names of the columns treated as categorical, grouped by family:
# identity fields, device/product/card descriptors, e-mail domains,
# card and address codes, match flags and the split e-mail domain parts.
categorical_columns = (
    [f'id_{num}' for num in range(12, 39)]
    + ['DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4']
    + ['P_emaildomain', 'R_emaildomain']
    + ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
    + ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']
    + [f'{side}_emaildomain_{part}' for side in 'PR' for part in (1, 2, 3)]
)

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Restore the prepared frames and feature metadata, in the order they were dumped.
# NOTE(review): unpickling executes code embedded in the file, so only load
# artefacts produced by this notebook or another trusted source.
with open('../input/readydata/train_test.pkl', 'rb') as file:
    train, test, unused_feat, target_features, features, cat_idxs, cat_dims = pickle.load(file)

In [None]:
# TabNet classifier configuration.
clf = TabNetClassifier(
    # Positions and cardinalities of the categorical inputs, which the model
    # embeds (one embedding dimension per categorical column).
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    # Adam with a step-wise learning-rate decay at the listed epochs.
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
    scheduler_params={"milestones": [150, 250, 300, 350, 400, 450], 'gamma': 0.2},
    mask_type='entmax',  # alternative: "sparsemax"
    # 'auto' picks the GPU when one is available and the CPU otherwise; the
    # previous value 'xla' is not a documented device name and nothing in
    # this notebook imports an XLA backend.
    device_name='auto',
    # Network size.
    n_d=16,
    n_a=16,
    n_steps=4,
    gamma=1.3,
    n_independent=2,
    n_shared=2,
    seed=seed,
)

In [None]:
def run(fold, clf, train, test, max_epochs=1000, patience=70):
    """Fit ``clf`` with ``fold`` held out and return the imputed evaluation matrices.

    Missing values are replaced by per-column means computed on the training
    split only; the same imputer is then applied to the validation and test
    matrices.

    Parameters
    ----------
    fold : int
        Value of ``train.kfold`` used as the validation split; every other
        row is used for training.
    clf : estimator
        Model exposing ``fit(X_train, y_train, eval_set=..., max_epochs=...,
        patience=...)``. It is fitted in place.
    train : pandas.DataFrame
        Frame containing the ``features`` columns, the ``target_features``
        column and a ``kfold`` column.
    test : pandas.DataFrame
        Frame containing the ``features`` columns.
    max_epochs : int, optional
        Upper bound on the number of training epochs passed to ``clf.fit``.
    patience : int, optional
        Early-stopping patience passed to ``clf.fit``.

    Returns
    -------
    tuple
        ``(X_valid, Y_valid, X_test)``: imputed validation features,
        validation targets and imputed test features.

    Notes
    -----
    Reads the module-level names ``features`` and ``target_features``.
    """
    is_train = train.kfold != fold
    is_valid = train.kfold == fold

    X_train = train.loc[is_train, features].values
    Y_train = train.loc[is_train, target_features].values

    # Fit the imputer on the training split only, then reuse its column means.
    imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    X_train = imp.fit_transform(X_train)

    X_valid = imp.transform(train.loc[is_valid, features].values)
    Y_valid = train.loc[is_valid, target_features].values

    # Select the test columns by name so the matrix is guaranteed to have the
    # same columns, in the same order, as the data the imputer was fitted on.
    X_test = imp.transform(test[features].values)

    # Only temporaries can be reclaimed here; the caller still owns the frames.
    del is_train, is_valid
    gc.collect()

    print("--------Training Begining for fold {}-------------".format(fold+1))

    # Current pytorch-tabnet releases take validation data through `eval_set`.
    clf.fit(X_train = X_train,
            y_train = Y_train,
            eval_set = [(X_valid, Y_valid)],
            max_epochs = max_epochs,
            patience = patience)

    del X_train, Y_train

    return X_valid, Y_valid, X_test

In [None]:
# Train once, holding out fold index 6 for validation, and keep the imputed
# test matrix for the prediction step.
holdout_fold = 6
X_valid, Y_valid, imputed_test = run(holdout_fold, clf, train, test)

# The raw test frame is no longer needed once its imputed copy exists.
del test
gc.collect()

In [None]:
# Persist the fitted classifier so it can be downloaded and reused.
with open('model.pkl', 'wb') as file:
    pickle.dump(clf, file)

In [None]:
# Render a download link for the pickled model written above.
FileLink('model.pkl')

In [None]:
# The validation arrays are not used after this point; drop them and
# trigger a garbage collection before predicting on the test matrix.
del X_valid, Y_valid
gc.collect()

In [None]:
# Keep only column 1 of the predicted probabilities
# (presumably the probability of isFraud == 1 — TODO confirm the class order).
predictions = clf.predict_proba(imputed_test)[:,1]

In [None]:
# Write the predicted probabilities to disk and render a download link.
np.savetxt('predictions.csv', predictions, delimiter = ',')
FileLink('predictions.csv')