In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep.datasets import load_adult

In [10]:
# Fixed seed so the train/test split (and every number derived from it) is
# the same on each Restart & Run All.
SPLIT_SEED = 42

# Load the adult dataset and replace the raw "income" text column with a
# 0/1 "income_label" column (1 when the text contains ">50K").
df = load_adult(as_frame=True)
df = df.assign(
    income_label=df["income"].str.contains(">50K", regex=False).astype(int)
).drop(columns="income")

# Hold out 20% of the rows, stratifying on the label column.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["income_label"],
    random_state=SPLIT_SEED,
)

In [2]:
# Show the first rows of the held-out split as a quick sanity check.
# df_test is created by the train/test split cell, which must run first.
df_test.head()

NameError: name 'df_test' is not defined

In [12]:
# Column configuration shared by the preprocessing and model cells.

# Columns handed to the wide preprocessor, plus the pairs it should cross.
wide_cols = [
    "education", "relationship", "workclass",
    "occupation", "native-country", "gender",
]
crossed_cols = [
    ("education", "occupation"),
    ("native-country", "occupation"),
]

# Columns handed to the tabular preprocessor: embedded categoricals and
# continuous features.
cat_embed_cols = [
    "workclass", "education", "marital-status", "occupation", "relationship",
    "race", "gender", "capital-gain", "capital-loss", "native-country",
]
continuous_cols = ["age", "hours-per-week"]

# Training labels as an array; the column name keeps its own variable so
# `target` only ever refers to the label values.
target_col = "income_label"
target = df_train[target_col].values

In [13]:
# Fit both preprocessors on the training frame and encode it.

# Input for the wide component.
wide_preprocessor = WidePreprocessor(
    wide_cols=wide_cols,
    crossed_cols=crossed_cols,
)
X_wide = wide_preprocessor.fit_transform(df_train)

# Input for the deep tabular component.
tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols, continuous_cols=continuous_cols  # type: ignore[arg-type]
)
X_tab = tab_preprocessor.fit_transform(df_train)

In [14]:
# build the model

# Seed torch's global RNG first so that any random weight initialisation done
# while constructing the components below is repeatable across kernel restarts.
torch.manual_seed(42)

# The wide component is sized by the number of distinct values in X_wide.
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)

# The deep component reads its column layout and embedding set-up from the
# fitted tabular preprocessor.
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
)

# Combine both components into a single model.
model = WideDeep(wide=wide, deeptabular=tab_mlp)

In [15]:
# Fit the combined model on the training arrays, reporting accuracy per epoch.
N_EPOCHS = 5
BATCH_SIZE = 256

trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(
    X_wide=X_wide, X_tab=X_tab, target=target,
    n_epochs=N_EPOCHS, batch_size=BATCH_SIZE,
)

epoch 1: 100%|██████████| 153/153 [00:01<00:00, 120.14it/s, loss=0.434, metrics={'acc': 0.7975}]
epoch 2: 100%|██████████| 153/153 [00:01<00:00, 122.77it/s, loss=0.349, metrics={'acc': 0.837}] 
epoch 3: 100%|██████████| 153/153 [00:01<00:00, 120.08it/s, loss=0.324, metrics={'acc': 0.8496}]
epoch 4: 100%|██████████| 153/153 [00:01<00:00, 122.70it/s, loss=0.311, metrics={'acc': 0.8543}]
epoch 5: 100%|██████████| 153/153 [00:01<00:00, 118.86it/s, loss=0.303, metrics={'acc': 0.8595}]


In [16]:
# Encode the held-out frame with the already-fitted preprocessors
# (transform only, no refit), then predict on it.
X_wide_te, X_tab_te = (
    wide_preprocessor.transform(df_test),
    tab_preprocessor.transform(df_test),
)
preds = trainer.predict(
    X_wide=X_wide_te,
    X_tab=X_tab_te,
)

predict: 100%|██████████| 39/39 [00:00<00:00, 78.77it/s]


In [1]:
# Display the first rows of the held-out frame again for reference.
# Needs df_test from the train/test split cell to exist in the kernel.
df_test.head()

NameError: name 'df_test' is not defined

In [17]:
# Probability output for the same held-out inputs; the metrics cell reads
# column 1 of this array.
pred_probs = trainer.predict_proba(
    X_wide=X_wide_te,
    X_tab=X_tab_te,
)

predict: 100%|██████████| 39/39 [00:00<00:00, 82.94it/s]


In [18]:
from sklearn.metrics import average_precision_score, roc_auc_score

# Ground-truth labels for the held-out rows. The column is indexed by its
# literal name instead of re-binding `target`: the training cell passes
# `target` to `trainer.fit` as the label array, so overwriting it with a
# string here would break a re-run of that cell.
y = df_test["income_label"].values

# Column 1 of pred_probs is used as the score (presumably the predicted
# probability of label 1 - confirm against the library's predict_proba docs).
positive_scores = pred_probs[:, 1]

print("ROC-AUC:{}".format(roc_auc_score(y, positive_scores)))
print("PrecisionRecall-AUC:{}".format(average_precision_score(y, positive_scores)))

ROC-AUC:0.9228221853772127
PrecisionRecall-AUC:0.8247246872897009


In [19]:
from pytorch_widedeep import Tab2Vec

# Build a Tab2Vec from the trained model and the fitted tabular preprocessor,
# then transform the training frame; the call returns the vectors together
# with the values of the named target column.
t2v = Tab2Vec(
    tab_preprocessor=tab_preprocessor,
    model=model,
)
train_vectors = t2v.transform(df_train, target_col="income_label")
X_vec, y = train_vectors

In [21]:
# Dimensions of the array Tab2Vec returned for the training frame.
X_vec.shape

(39073, 95)

In [22]:
# Same transformation for the held-out frame. This re-binds X_vec and y, so
# the training-set vectors are no longer available under those names.
test_vectors = t2v.transform(df_test, target_col="income_label")
X_vec, y = test_vectors

In [23]:
# Dimensions of the array Tab2Vec returned for the held-out frame.
X_vec.shape

(9769, 95)

In [24]:
# Target values returned alongside the vectors by the latest t2v.transform call.
y

array([0, 0, 1, ..., 0, 0, 0])