In [11]:
import category_encoders as ce
import numpy as np
import pandas as pd
import statsmodels.api as sm
import torch
import torch.nn as nn
from rtdl_num_embeddings import (
    LinearReLUEmbeddings,
    PeriodicEmbeddings,
    PiecewiseLinearEncoding,
    PiecewiseLinearEmbeddings,
    compute_bins,
)
# NOTE: pip install rtdl_revisiting_models
from rtdl_revisiting_models import MLP
from sklearn.model_selection import train_test_split

# Seed passed to the train/test split so the partition is reproducible.
RANDOM_SEED = 42


# Load the raw table; the "income" column is the target.
df = pd.read_csv("data/adult.csv")

# Encode the target as 0/1.
# `Series.map` silently turns every label that is missing from the mapping
# into NaN, so reject unexpected labels up front instead of carrying NaN
# targets into the split.
mapping = {"<=50K": 0, ">50K": 1}
unmapped_labels = df.loc[~df["income"].isin(list(mapping)), "income"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected income labels (would be mapped to NaN): {unmapped_labels}"
    )
df["income"] = df["income"].map(mapping)

# Separate the feature columns from the target column.
X = df.drop(columns="income")
y = df["income"]

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_train


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
37193,42,Private,145175,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
31093,52,Self-emp-not-inc,175029,10th,6,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,35,United-States
33814,34,Local-gov,172664,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
14500,28,Private,125791,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States
23399,46,Private,28419,Assoc-voc,11,Never-married,Transport-moving,Not-in-family,White,Male,0,0,50,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,36,Private,635913,HS-grad,9,Married-spouse-absent,Other-service,Not-in-family,Black,Male,0,0,40,United-States
44732,34,Private,107624,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
38158,28,Private,250135,Some-college,10,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States
860,46,State-gov,96652,Assoc-voc,11,Separated,Adm-clerical,Unmarried,Black,Female,0,0,40,United-States


In [None]:
# Keep only the numeric columns of the training split. The "number" selector
# matches every numeric dtype, so no integer/float width has to be listed
# (or can be forgotten) by hand.
numeric_features = X_train.select_dtypes(include="number")
# Preview the first rows only.
numeric_features.head()

In [13]:
# Copy the numeric training columns into a float32 tensor
# (one row per sample, one column per numeric feature).
num_tensor = torch.tensor(
    numeric_features.to_numpy(),
    dtype=torch.float32,
)
num_tensor

tensor([[4.2000e+01, 1.4518e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        [5.2000e+01, 1.7503e+05, 6.0000e+00, 0.0000e+00, 0.0000e+00, 3.5000e+01],
        [3.4000e+01, 1.7266e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        ...,
        [2.8000e+01, 2.5014e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        [4.6000e+01, 9.6652e+04, 1.1000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        [5.9000e+01, 1.7612e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+00]])

In [21]:
# Dimensions of the numeric training matrix.
batch_size = numeric_features.shape[0]       # number of rows (samples)
n_cont_features = numeric_features.shape[1]  # number of numeric (continuous) columns


In [26]:
# Seed torch right before the model is built so that the random weight
# initialisation and the forward pass below give the same numbers after a
# kernel restart.
torch.manual_seed(RANDOM_SEED)

# Keyword arguments forwarded to the MLP backbone.
mlp_config = {
    'd_out': 1,  # One output per row (the income target is a single 0/1 column).
    'n_blocks': 2,
    'd_block': 256,
    'dropout': 0.1,
}

# This is how an MLP with embeddings for continuous features can be created.

# Size of the embedding vector produced for each continuous feature.
d_embedding = 8

# The number of continuous features is the first argument of the embedding module.
m_cont_embeddings = PeriodicEmbeddings(n_cont_features, d_embedding=d_embedding, lite=False)

model_with_embeddings = nn.Sequential(
    # Input shape: (batch_size, n_cont_features)

    m_cont_embeddings,
    # After embeddings: (batch_size, n_cont_features, d_embedding)

    # `nn.Flatten` is not needed for Transformer-like architectures
    # (because they process the features one by one, as vectors).
    nn.Flatten(),

    # After flattening: (batch_size, n_cont_features * d_embedding)

    MLP(d_in=n_cont_features * d_embedding, **mlp_config)
    # The final shape: (batch_size, d_out)
)
# NOTE(review): `num_tensor` holds the raw, unscaled columns (some values are
# around 1e5); confirm whether the features should be standardised before
# they are fed to the periodic embeddings.
# The usage is the same as for the model without embeddings:
y_pred = model_with_embeddings(num_tensor)

In [27]:
# Display the outputs of the forward pass (one value per input row).
y_pred

tensor([[-0.0306],
        [-0.0385],
        [-0.0308],
        ...,
        [-0.0105],
        [-0.0089],
        [-0.0042]], grad_fn=<AddmmBackward0>)