In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as tqdm
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, F1Score
from pytorch_widedeep.datasets import load_adult
from pytorch_widedeep.models import FTTransformer
import warnings
from torchmetrics import AveragePrecision, AUROC
warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*<zmq.*>")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
lukup = {'defaultCredit':'default.payment.next.month', 'bank':'y'}
name = 'defaultCredit'
label = lukup[name]
fold = 1
train_df = pd.read_csv('/home/vineeth/Documents/GitWorkSpace/PytorchRecipes/SimpleMLP/Dataset/{}/fold{}/train/data.csv'.format(name, fold))
valid_df = pd.read_csv('/home/vineeth/Documents/GitWorkSpace/PytorchRecipes/SimpleMLP/Dataset/{}/fold{}/valid/data.csv'.format(name, fold))
test_df = pd.read_csv('/home/vineeth/Documents/GitWorkSpace/PytorchRecipes/SimpleMLP/Dataset/{}/fold{}/test/data.csv'.format(name, fold))
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,-0.134759,0,1,1,-1.029047,2,1,1,1,1,...,-0.428605,-0.369043,-0.239816,0.503316,-0.03998,-0.012818,0.194622,0.536758,-0.180878,0
1,1.483795,1,0,1,1.357652,2,1,1,1,1,...,0.579916,-0.525994,-0.512933,-0.070252,-0.126784,2.543032,0.228134,0.230763,0.436037,0
2,-0.674276,1,0,1,-0.378129,0,1,1,1,1,...,0.37445,0.561493,0.571863,-0.160815,-0.083165,-0.15481,0.330267,-0.314136,-0.012122,1
3,-0.75135,0,2,0,1.249166,2,1,1,1,1,...,0.425778,0.504203,0.545752,-0.14874,-0.109423,-0.132091,-0.132522,-0.107827,-0.141502,0
4,-0.365981,1,1,1,0.598248,2,1,1,1,1,...,1.126163,1.336657,1.378068,0.020312,-0.061681,-0.041216,0.196217,-0.117776,-0.067081,0


In [3]:
# Define the 'column set up'
wide_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]

cat_embed_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]
continuous_cols = ["LIMIT_BAL", "BILL_AMT1", "BILL_AMT1", "BILL_AMT2", \
     "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", 'PAY_AMT1', 'PAY_AMT1',\
        'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
target = "default.payment.next.month"
target = train_df[target].values

In [4]:
# prepare the data
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(train_df)

tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols, continuous_cols=continuous_cols  # type: ignore[arg-type]
)
X_tab = tab_preprocessor.fit_transform(train_df)

In [5]:
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
# Define the FTTransformer model
cat_embed_input = tab_preprocessor.cat_embed_input
column_idx = tab_preprocessor.column_idx
deeptabular = FTTransformer(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=cat_embed_input,
    continuous_cols=continuous_cols,
    n_heads=8, 
    n_blocks=6,
    use_qkv_bias=True, 
    attn_dropout=0.1, 
    ff_dropout=0.1,
)

model = WideDeep(wide=wide, deeptabular=deeptabular)

In [6]:
model

WideDeep(
  (wide): Wide(
    (wide_linear): Embedding(77, 1, padding_idx=0)
  )
  (deeptabular): Sequential(
    (0): FTTransformer(
      (cat_and_cont_embed): SameSizeCatAndContEmbeddings(
        (cat_embed): SameSizeCatEmbeddings(
          (embed): Embedding(77, 64, padding_idx=0)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (cont_norm): Identity()
        (cont_embed): ContEmbeddings(15, 64, embed_dropout=0.1, use_bias=True)
      )
      (encoder): Sequential(
        (fttransformer_block0): FTTransformerEncoder(
          (attn): LinearAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
            (out_proj): Linear(in_features=64, out_features=64, bias=True)
          )
          (ff): FeedForward(
            (w_1): Linear(in_features=64, out_features=170, bias=True)
            (w_2): Linear(in_features=85, out_features=64, bias=True)
            (dropout): Dropo

In [7]:
# train and validate
trainer = Trainer(model, objective="binary", accelerator="gpu",\
                  metrics=[AUROC(task='binary'), F1Score, AveragePrecision(task='binary')])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=100,
    batch_size=256,
)

epoch 1: 100%|██████████| 85/85 [00:02<00:00, 29.62it/s, loss=0.486, metrics={'BinaryAUROC': 0.7109, 'f1': 0.4196, 'BinaryAveragePrecision': 0.4553}]
epoch 2: 100%|██████████| 85/85 [00:02<00:00, 36.10it/s, loss=0.443, metrics={'BinaryAUROC': 0.7567, 'f1': 0.4538, 'BinaryAveragePrecision': 0.5212}]
epoch 3: 100%|██████████| 85/85 [00:02<00:00, 35.17it/s, loss=0.439, metrics={'BinaryAUROC': 0.7642, 'f1': 0.4676, 'BinaryAveragePrecision': 0.5329}]
epoch 4: 100%|██████████| 85/85 [00:02<00:00, 36.25it/s, loss=0.436, metrics={'BinaryAUROC': 0.7686, 'f1': 0.4699, 'BinaryAveragePrecision': 0.5378}]
epoch 5: 100%|██████████| 85/85 [00:02<00:00, 37.04it/s, loss=0.434, metrics={'BinaryAUROC': 0.7705, 'f1': 0.4749, 'BinaryAveragePrecision': 0.5408}]
epoch 6: 100%|██████████| 85/85 [00:02<00:00, 36.62it/s, loss=0.433, metrics={'BinaryAUROC': 0.7733, 'f1': 0.4795, 'BinaryAveragePrecision': 0.5464}]
epoch 7: 100%|██████████| 85/85 [00:02<00:00, 36.09it/s, loss=0.432, metrics={'BinaryAUROC': 0.7732,

In [8]:
# predict on test
X_wide_te = wide_preprocessor.transform(test_df)
X_tab_te = tab_preprocessor.transform(test_df)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)
pred_probs = trainer.predict_proba(X_wide=X_wide_te, X_tab=X_tab_te)

predict: 100%|██████████| 24/24 [00:00<00:00, 33.52it/s]
predict: 100%|██████████| 24/24 [00:00<00:00, 34.31it/s]


In [9]:
from sklearn.metrics import average_precision_score, roc_auc_score
target = lukup[name]
y = test_df[target].values
print("ROC-AUC:{}".format(roc_auc_score(y, pred_probs[:, 1])))
print("PrecisionRecall-AUC:{}".format(average_precision_score(y, pred_probs[:, 1])))

ROC-AUC:0.6923589337984494
PrecisionRecall-AUC:0.4104363421893267


In [11]:
from pytorch_widedeep import Tab2Vec
t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor)
X_vec, y = t2v.transform(train_df, target_col=target)

In [12]:
X_vec

array([[-1.4172393e-01, -2.9816014e-01,  2.1386049e+00, ...,
         5.1322991e-01,  8.6423665e-01, -3.0246025e-03],
       [-1.8721935e+00,  8.3828169e-01, -4.5036829e-01, ...,
         5.5625874e-01,  4.9738172e-01,  8.4056890e-01],
       [-1.8721935e+00,  8.3828169e-01, -4.5036829e-01, ...,
         6.8739414e-01, -1.5589465e-01,  2.2773863e-01],
       ...,
       [-1.4172393e-01, -2.9816014e-01, -4.5036829e-01, ...,
        -5.1062021e-02, -8.0248013e-02, -8.1868723e-02],
       [-1.8721935e+00,  8.3828169e-01,  2.1386049e+00, ...,
         3.1717189e-02,  1.0486166e-03, -3.0246025e-03],
       [-1.8721935e+00,  8.3828169e-01, -4.5036829e-01, ...,
        -6.6634342e-02, -7.7423006e-02, -9.5329911e-02]], dtype=float32)