In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as tqdm
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, F1Score
from pytorch_widedeep.datasets import load_adult
from torchmetrics import AveragePrecision, AUROC
from pytorch_widedeep.callbacks import EarlyStopping
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GetData:
    def __init__(self, path, dset_name, name, missing_perc=0, n_rows=1, n_clms=1):
        self.mask = missing_perc
        dset_name = dset_name
        if self.mask:
            self.suffix = 'Missing{}_1'.format(self.mask)
        else:
            self.suffix = ''
        self.label = {'defaultCredit':'default.payment.next.month', 'bank':'y'}[dset_name]
        self.path = path
        self.dataset = name
        self.n_rows, self.n_clms = n_rows, n_clms
        self.embedding = name

    @property
    def data(self):
        if self.mask:
            return self.parse_mask(self._dataset)
        else:
            return self._dataset
    @property
    def embedding(self):
        return self._embedding
    
    @embedding.setter
    def embedding(self, name):
        fil_name = "{}/embedding.csv".format(self.path)
        if os.path.isfile(fil_name):
            self._embedding = pd.read_csv(fil_name)
        else:
            print("embedding dataset does not exist")
    
    @data.setter
    def dataset(self, name):

        fil_name = "{}/{}{}.csv".format(self.path, name, self.suffix)
        mask_fil_name = "{}/{}_{}.csv".format(self.path, name, self.suffix)
        if os.path.isfile(fil_name) and not self.mask:
            self._dataset = pd.read_csv(fil_name)
            self.n_rows, self.n_clms = self._dataset.shape[0], self._dataset.shape[1]
        elif self.mask and os.path.isfile(mask_fil_name):
            self._dataset = pd.read_csv(mask_fil_name)

    def parse_mask(self, df):

        df = df.groupby('row')['col'].apply(list)
        mask = []
        for i in range(self.n_rows):
            clm_indices = np.array(df[i] if i in df.index else [])
            mask_i = np.ones(self.n_clms, dtype=np.float64)
            if len(clm_indices):
                np.put(mask_i, clm_indices, np.nan)
            mask += [mask_i]

        return np.stack(mask, axis=0)

In [3]:
data_name = 'defaultCredit'
model_name = 'iwae'
fold = 0
PATH_train= "/media/6TB_Volume/DataRepo/small_datasets/{}/fold{}/train".format(data_name, fold)
PATH_test = "/media/6TB_Volume/DataRepo/small_datasets/{}/fold{}/test".format(data_name, fold)

original_data = GetData(PATH_train, data_name, 'normalized_data')
embedding_data = GetData("{}/{}".format(PATH_train, model_name), data_name, 'normalized_data')
label = original_data.label
df = original_data.data
# df_emb = embedding_data.embedding.drop(label, axis=1)
df_emb = embedding_data.embedding
df_train = df_emb.merge(df, right_index=True, left_index=True)

original_data = GetData(PATH_test, data_name, 'normalized_data')
embedding_data = GetData("{}/{}".format(PATH_test, model_name), data_name, 'normalized_data')
test_df = original_data.data
# df_emb = embedding_data.embedding.drop(label, axis=1)
df_emb = embedding_data.embedding
df_test = df_emb.merge(test_df, right_index=True, left_index=True)
# train_df, test_df = train_test_split(train_df, test_size=0.20, random_state=42)
df_train.head()

embedding dataset does not exist
embedding dataset does not exist


Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0.15591,-0.792147,0.81527,0.68383,1.597049,0.066575,-1.752827,-0.129553,-0.486362,-0.372769,...,-39980.9,-36665.01,-36393.754,1149.0,2140.0,3313.0,3750.0,2491.0,9785.0,0.0
1,1.206568,1.165155,0.50521,-0.219921,-0.38264,-0.888843,1.307427,1.233807,-0.604676,-0.713044,...,85407.1,91113.99,96357.25,9000.0,7027.0,5000.0,5000.0,6000.0,5000.0,0.0
2,-0.259755,-1.481801,-0.975531,-1.255872,-0.939124,-0.48143,-0.677546,-1.19671,-1.595688,0.160315,...,-22316.9,-17128.012,-16247.754,2900.0,700.0,1000.0,2970.0,1e-20,1e-20,0.0
3,2.855731,-0.069268,0.471339,0.903956,-0.652457,-1.678368,1.792997,-0.171269,0.567137,-0.222312,...,-12988.899,-10260.012,-38323.754,1655.0,1797.0,750.0,1e-20,561.0,500.0,0.0
4,-0.798088,0.24937,-0.20112,-2.998457,0.388763,-3.267331,0.053656,3.127441,0.268455,-0.324314,...,27453.1,27738.988,30930.246,5000.0,4792.0,3000.0,3019.0,8010.0,7000.0,1.0


In [4]:
emb_clms = [c for c in df_train.columns if 'emb' in c] 
# df_train = df_train[emb_clms + [label]]
# df_test = df_test[emb_clms + [label]]

In [5]:
# Define the 'column set up'
wide_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]

cat_embed_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6"
]
continuous_cols = ["LIMIT_BAL", "BILL_AMT1", "BILL_AMT1", "BILL_AMT2", \
     "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", 'PAY_AMT1', 'PAY_AMT1',\
        'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'] + emb_clms
target = "default.payment.next.month"
target = df_train[target].values

In [6]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(df_train)

tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols, continuous_cols=continuous_cols  # type: ignore[arg-type]
)
X_tab = tab_preprocessor.fit_transform(df_train)

In [7]:
# build the model
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deeptabular=tab_mlp)

In [8]:
# train and validate
trainer = Trainer(model, objective="binary", metrics=[AUROC(task='binary'), F1Score, AveragePrecision(task='binary')])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=30,
    batch_size=256,
)

epoch 1: 100%|██████████| 94/94 [00:01<00:00, 58.28it/s, loss=0.529, metrics={'BinaryAUROC': 0.6591, 'f1': 0.302, 'BinaryAveragePrecision': 0.3502}]  
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork(

In [9]:
# predict on test
X_wide_te = wide_preprocessor.transform(df_test)
X_tab_te = tab_preprocessor.transform(df_test)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()



In [10]:
from sklearn.metrics import average_precision_score, roc_auc_score
pred_probs = trainer.predict_proba(X_wide=X_wide_te, X_tab=X_tab_te)
target = "default.payment.next.month"
y = df_test[target].values
print("ROC-AUC:{}".format(roc_auc_score(y, pred_probs[:, 1])))
print("PrecisionRecall-AUC:{}".format(average_precision_score(y, pred_probs[:, 1])))

  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


ROC-AUC:0.7672862298332872
PrecisionRecall-AUC:0.5329961567372081





In [125]:
# prepare the data
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(df_train)

tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols, continuous_cols=continuous_cols  # type: ignore[arg-type]
)
X_tab = tab_preprocessor.fit_transform(df_train)

KeyError: "['emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7', 'emb_8', 'emb_9', 'emb_10', 'emb_11', 'emb_12', 'emb_13', 'emb_14', 'emb_15', 'emb_16', 'emb_17'] not in index"

In [126]:
# build the model
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deeptabular=tab_mlp)

AttributeError: 'TabPreprocessor' object has no attribute 'column_idx'

In [127]:
# train and validate
early_stop_callback = EarlyStopping(monitor=AveragePrecision, \
             min_delta=0.01, patience=3, verbose=False, mode="max")
trainer = Trainer(model, objective="binary", accelerator='gpu', max_epochs=30, \
                 min_epochs=1, callbacks=[early_stop_callback])
# trainer = Trainer(model, objective="binary", metrics=[AUROC, F1Score, AveragePrecision])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=30,
    batch_size=256,
)

epoch 1: 100%|██████████| 94/94 [00:00<00:00, 94.59it/s, loss=0.576] 
epoch 2: 100%|██████████| 94/94 [00:01<00:00, 93.93it/s, loss=0.526] 
epoch 3: 100%|██████████| 94/94 [00:01<00:00, 92.85it/s, loss=0.501] 
epoch 4: 100%|██████████| 94/94 [00:01<00:00, 93.84it/s, loss=0.486] 
epoch 5: 100%|██████████| 94/94 [00:01<00:00, 90.87it/s, loss=0.476] 
epoch 6: 100%|██████████| 94/94 [00:01<00:00, 92.61it/s, loss=0.469] 
epoch 7: 100%|██████████| 94/94 [00:01<00:00, 91.31it/s, loss=0.465] 
epoch 8: 100%|██████████| 94/94 [00:01<00:00, 89.69it/s, loss=0.461] 
epoch 9: 100%|██████████| 94/94 [00:01<00:00, 92.18it/s, loss=0.458] 
epoch 10: 100%|██████████| 94/94 [00:01<00:00, 88.61it/s, loss=0.455] 
epoch 11: 100%|██████████| 94/94 [00:01<00:00, 91.49it/s, loss=0.452] 
epoch 12: 100%|██████████| 94/94 [00:00<00:00, 94.49it/s, loss=0.45]  
epoch 13: 100%|██████████| 94/94 [00:00<00:00, 94.19it/s, loss=0.448] 
epoch 14: 100%|██████████| 94/94 [00:01<00:00, 92.21it/s, loss=0.447] 
epoch 15: 100%|

In [163]:
# predict on test
X_wide_te = wide_preprocessor.transform(df_test)
X_tab_te = tab_preprocessor.transform(df_test)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

KeyError: "None of [Index(['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',\n       'PAY_5', 'PAY_6'],\n      dtype='object')] are in the [columns]"

In [107]:
pred_probs = trainer.predict_proba(X_wide=X_wide_te, X_tab=X_tab_te)

predict: 100%|██████████| 24/24 [00:00<00:00, 42.36it/s]


In [108]:
from sklearn.metrics import average_precision_score, roc_auc_score
target = "default.payment.next.month"
y = df_test[target].values
print("ROC-AUC:{}".format(roc_auc_score(y, pred_probs[:, 1])))
print("PrecisionRecall-AUC:{}".format(average_precision_score(y, pred_probs[:, 1])))

ROC-AUC:0.7577691069442939
PrecisionRecall-AUC:0.5138075451045618


In [48]:
from pytorch_widedeep import Tab2Vec
t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor)
X_vec, y = t2v.transform(df_train, target_col="default.payment.next.month")

In [50]:
X_vec

array([[-2.0625865 , -0.79887253,  1.4173131 , ..., -0.1364332 ,
        -0.18054675,  0.11468483],
       [ 0.4391028 ,  0.02701583,  1.4173131 , ...,  0.01621273,
        -0.0554886 , -0.16666806],
       [-2.0625865 , -0.79887253,  0.5986612 , ..., -0.11623305,
        -0.05020997, -0.08599467],
       ...,
       [-2.0625865 , -0.79887253,  1.4173131 , ...,  0.1665693 ,
         0.3082162 ,  0.1384262 ],
       [-2.0625865 , -0.79887253,  1.4173131 , ...,  0.48647264,
         0.09348635, -0.18639345],
       [-2.0625865 , -0.79887253,  0.        , ..., -0.17750688,
         0.5486876 , -0.14032812]], dtype=float32)

In [22]:
X_vec, y = t2v.transform(df_test, target_col="income_label")

In [23]:
X_vec.shape

(9769, 95)

In [24]:
y

array([0, 0, 1, ..., 0, 0, 0])