In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as tqdm
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, TabResnet
from pytorch_widedeep.metrics import Accuracy, F1Score
from pytorch_widedeep.datasets import load_adult
import warnings
from torchmetrics import AveragePrecision, AUROC
warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*<zmq.*>")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
lukup = {'defaultCredit':'default.payment.next.month', 'bank':'y'}
name = 'defaultCredit'
label = lukup[name]
fold = 0
train_df = pd.read_csv('/home/vineeth/Documents/GitWorkSpace/PytorchRecipes/TabularDataModels/Dataset/{}/fold{}/train/data.csv'.format(name, fold))
valid_df = pd.read_csv('/home/vineeth/Documents/GitWorkSpace/PytorchRecipes/TabularDataModels/Dataset/{}/fold{}/valid/data.csv'.format(name, fold))
test_df = pd.read_csv('/home/vineeth/Documents/GitWorkSpace/PytorchRecipes/TabularDataModels/Dataset/{}/fold{}/test/data.csv'.format(name, fold))
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,-0.365981,0,0,0,-0.812074,4,0,1,1,1,...,0.275557,0.356085,0.442651,-0.341942,-0.126784,-0.126411,-0.116564,0.013131,-0.293382,1
1,-0.674276,0,0,1,-1.24602,1,2,0,0,2,...,-0.588044,-0.62929,-0.555651,-0.257657,0.556969,0.012684,-0.176631,0.065363,-0.119114,0
2,-1.13672,1,0,1,1.249166,2,1,1,1,1,...,-0.419745,-0.403667,-0.386071,-0.251378,-0.083382,-0.205927,-0.276146,-0.281409,-0.270881,0
3,-0.905498,1,0,0,0.055816,2,1,1,1,1,...,0.091853,-0.499282,-0.484068,-0.221191,-0.161505,-0.231599,-0.276146,-0.281409,-0.272231,1
4,0.250611,1,1,1,-0.812074,3,3,3,2,0,...,-0.667849,-0.65814,-0.647804,-0.341942,-0.25699,-0.279762,-0.288913,-0.294893,-0.293382,0


In [3]:
# Define the 'column set up'
wide_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]

cat_embed_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]
continuous_cols = ["LIMIT_BAL", "BILL_AMT1", "BILL_AMT1", "BILL_AMT2", \
     "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", 'PAY_AMT1', 'PAY_AMT1',\
        'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
target = "default.payment.next.month"
target = train_df[target].values

In [4]:
# prepare the data
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(train_df)

tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols, continuous_cols=continuous_cols  # type: ignore[arg-type]
)
X_tab = tab_preprocessor.fit_transform(train_df)

In [5]:
X_tab[:,-1]

array([-0.30413785, -0.12509114, -0.28102013, ..., -0.07244054,
       -0.0151664 , -0.0151664 ])

In [6]:
tab_preprocessor.column_idx

{'SEX': 0,
 'EDUCATION': 1,
 'MARRIAGE': 2,
 'PAY_0': 3,
 'PAY_2': 4,
 'PAY_3': 5,
 'PAY_4': 6,
 'PAY_5': 7,
 'PAY_6': 8,
 'LIMIT_BAL': 9,
 'BILL_AMT1': 11,
 'BILL_AMT2': 12,
 'BILL_AMT3': 13,
 'BILL_AMT4': 14,
 'BILL_AMT5': 15,
 'BILL_AMT6': 16,
 'PAY_AMT1': 18,
 'PAY_AMT2': 19,
 'PAY_AMT3': 20,
 'PAY_AMT4': 21,
 'PAY_AMT5': 22,
 'PAY_AMT6': 23}

In [7]:
# build the model
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
# tab_mlp = TabMlp(
#     column_idx=tab_preprocessor.column_idx,
#     cat_embed_input=tab_preprocessor.cat_embed_input,
#     continuous_cols=continuous_cols,
#     mlp_hidden_dims=[400, 200],
#     mlp_dropout=0.5,
#     mlp_activation="leaky_relu",
#     embed_continuous=True,
#     mlp_batchnorm=True
# )

tab_mlp = TabResnet(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
    mlp_hidden_dims=[400, 200],
    mlp_dropout=0.5,
    mlp_activation="leaky_relu"
)
model = WideDeep(wide=wide, deeptabular=tab_mlp)

In [8]:
tab_mlp

TabResnet(
  (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
    (cat_embed): DiffSizeCatEmbeddings(
      (embed_layers): ModuleDict(
        (emb_layer_SEX): Embedding(3, 2, padding_idx=0)
        (emb_layer_EDUCATION): Embedding(8, 5, padding_idx=0)
        (emb_layer_MARRIAGE): Embedding(5, 3, padding_idx=0)
        (emb_layer_PAY_0): Embedding(12, 6, padding_idx=0)
        (emb_layer_PAY_2): Embedding(12, 6, padding_idx=0)
        (emb_layer_PAY_3): Embedding(12, 6, padding_idx=0)
        (emb_layer_PAY_4): Embedding(12, 6, padding_idx=0)
        (emb_layer_PAY_5): Embedding(11, 6, padding_idx=0)
        (emb_layer_PAY_6): Embedding(11, 6, padding_idx=0)
      )
      (embedding_dropout): Dropout(p=0.1, inplace=False)
    )
    (cont_norm): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encoder): DenseResnet(
    (dense_resnet): Sequential(
      (lin_inp): Linear(in_features=61, out_features=200, bias=False)
      (bn_inp): BatchNorm1d(

In [54]:
# train and validate
trainer = Trainer(model, objective="binary", accelerator="gpu",\
                  metrics=[AUROC(task='binary'), F1Score, AveragePrecision(task='binary')])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=100,
    batch_size=256,
)

epoch 1: 100%|██████████| 85/85 [00:01<00:00, 69.68it/s, loss=0.541, metrics={'BinaryAUROC': 0.6305, 'f1': 0.2789, 'BinaryAveragePrecision': 0.3552}] 
epoch 2: 100%|██████████| 85/85 [00:01<00:00, 74.67it/s, loss=0.496, metrics={'BinaryAUROC': 0.6896, 'f1': 0.339, 'BinaryAveragePrecision': 0.4197}]  
epoch 3: 100%|██████████| 85/85 [00:01<00:00, 75.74it/s, loss=0.482, metrics={'BinaryAUROC': 0.7071, 'f1': 0.3467, 'BinaryAveragePrecision': 0.4441}] 
epoch 4: 100%|██████████| 85/85 [00:01<00:00, 79.55it/s, loss=0.472, metrics={'BinaryAUROC': 0.7229, 'f1': 0.369, 'BinaryAveragePrecision': 0.4617}]  
epoch 5: 100%|██████████| 85/85 [00:01<00:00, 71.66it/s, loss=0.467, metrics={'BinaryAUROC': 0.7275, 'f1': 0.3702, 'BinaryAveragePrecision': 0.4706}] 
epoch 6: 100%|██████████| 85/85 [00:01<00:00, 74.80it/s, loss=0.462, metrics={'BinaryAUROC': 0.7353, 'f1': 0.3772, 'BinaryAveragePrecision': 0.4793}] 
epoch 7: 100%|██████████| 85/85 [00:01<00:00, 74.88it/s, loss=0.457, metrics={'BinaryAUROC': 0

In [55]:
# predict on test
X_wide_te = wide_preprocessor.transform(test_df)
X_tab_te = tab_preprocessor.transform(test_df)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)
pred_probs = trainer.predict_proba(X_wide=X_wide_te, X_tab=X_tab_te)

predict: 100%|██████████| 24/24 [00:00<00:00, 49.16it/s]
predict: 100%|██████████| 24/24 [00:00<00:00, 45.31it/s]


In [56]:
from sklearn.metrics import average_precision_score, roc_auc_score
target = lukup[name]
y = test_df[target].values
print("ROC-AUC:{}".format(roc_auc_score(y, pred_probs[:, 1])))
print("PrecisionRecall-AUC:{}".format(average_precision_score(y, pred_probs[:, 1])))

ROC-AUC:0.7925768688745115
PrecisionRecall-AUC:0.5728320749314739


In [11]:
from pytorch_widedeep import Tab2Vec
t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor)
X_vec, y = t2v.transform(train_df, target_col=target)

In [12]:
X_vec

array([[-1.4172393e-01, -2.9816014e-01,  2.1386049e+00, ...,
         5.1322991e-01,  8.6423665e-01, -3.0246025e-03],
       [-1.8721935e+00,  8.3828169e-01, -4.5036829e-01, ...,
         5.5625874e-01,  4.9738172e-01,  8.4056890e-01],
       [-1.8721935e+00,  8.3828169e-01, -4.5036829e-01, ...,
         6.8739414e-01, -1.5589465e-01,  2.2773863e-01],
       ...,
       [-1.4172393e-01, -2.9816014e-01, -4.5036829e-01, ...,
        -5.1062021e-02, -8.0248013e-02, -8.1868723e-02],
       [-1.8721935e+00,  8.3828169e-01,  2.1386049e+00, ...,
         3.1717189e-02,  1.0486166e-03, -3.0246025e-03],
       [-1.8721935e+00,  8.3828169e-01, -4.5036829e-01, ...,
        -6.6634342e-02, -7.7423006e-02, -9.5329911e-02]], dtype=float32)