In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import category_encoders as ce
import torch
import torch.nn as nn
from rtdl_num_embeddings import (
    LinearReLUEmbeddings,
    PeriodicEmbeddings,
    PiecewiseLinearEncoding,
    PiecewiseLinearEmbeddings,
    compute_bins,
)

RANDOM_SEED = 42

# Seed PyTorch's global RNG too: the models built later in this notebook draw
# their initial weights (and dropout masks) from it, so without this call
# their outputs change on every kernel restart even though a seed is defined.
torch.manual_seed(RANDOM_SEED)

# Read the raw table from the local CSV file.
df = pd.read_csv("data/adult.csv")

# Encode the target column as integers: "<=50K" -> 0, ">50K" -> 1.
mapping = {"<=50K": 0, ">50K": 1}
df["income"] = df["income"].map(mapping)

# Predictors are every column except the target.
X = df.drop("income", axis=1)
y = df["income"]

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_train


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
37193,42,Private,145175,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
31093,52,Self-emp-not-inc,175029,10th,6,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,35,United-States
33814,34,Local-gov,172664,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
14500,28,Private,125791,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States
23399,46,Private,28419,Assoc-voc,11,Never-married,Transport-moving,Not-in-family,White,Male,0,0,50,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,36,Private,635913,HS-grad,9,Married-spouse-absent,Other-service,Not-in-family,Black,Male,0,0,40,United-States
44732,34,Private,107624,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
38158,28,Private,250135,Some-college,10,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States
860,46,State-gov,96652,Assoc-voc,11,Separated,Adm-clerical,Unmarried,Black,Female,0,0,40,United-States


In [2]:
# Keep only the training columns whose dtype is one of the listed float /
# integer widths; every other column is left out.
numeric_features = X_train.select_dtypes(
    include=[
        "float64", "float32", "float16",
        "int64", "int32", "int16", "int8", "uint8",
    ]
)
numeric_features

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
37193,42,145175,10,0,0,40
31093,52,175029,6,0,0,35
33814,34,172664,9,0,0,40
14500,28,125791,9,0,0,40
23399,46,28419,11,0,0,50
...,...,...,...,...,...,...
11284,36,635913,9,0,0,40
44732,34,107624,10,0,0,50
38158,28,250135,10,0,0,40
860,46,96652,11,0,0,40


In [3]:
# Export the numeric training columns to a NumPy array, then build a float32
# tensor from it so it can be fed to the torch modules below.
numeric_array = numeric_features.to_numpy()
num_tensor = torch.tensor(numeric_array, dtype=torch.float32)
num_tensor

tensor([[4.2000e+01, 1.4518e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        [5.2000e+01, 1.7503e+05, 6.0000e+00, 0.0000e+00, 0.0000e+00, 3.5000e+01],
        [3.4000e+01, 1.7266e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        ...,
        [2.8000e+01, 2.5014e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        [4.6000e+01, 9.6652e+04, 1.1000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
        [5.9000e+01, 1.7612e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+00]])

In [4]:
# NOTE: pip install rtdl_revisiting_models
from rtdl_revisiting_models import MLP

# Row count and column count of the numeric training frame. The column count
# is the number of continuous features the embedding modules are sized for.
batch_size, n_cont_features = numeric_features.shape


In [5]:
# Settings shared by the MLP backbones built in this notebook.
mlp_config = dict(
    d_out=1,      # one output value per input row
    n_blocks=2,
    d_block=256,
    dropout=0.1,
)

# Length of the vector each continuous feature is embedded into.
d_embedding = 8

# Periodic embeddings; the first argument is the number of continuous features.
m_cont_embeddings = PeriodicEmbeddings(
    n_cont_features, d_embedding=d_embedding, lite=False
)

# MLP on top of embedded continuous features. Shapes, stage by stage:
#   input               (batch_size, n_cont_features)
#   after embeddings    (batch_size, n_cont_features, d_embedding)
#   after nn.Flatten    (batch_size, n_cont_features * d_embedding)
#   after MLP           (batch_size, d_out)
# nn.Flatten is not needed for Transformer-like architectures, because those
# process the features one by one as vectors.
model_with_embeddings = nn.Sequential(
    m_cont_embeddings,
    nn.Flatten(),
    MLP(d_in=n_cont_features * d_embedding, **mlp_config),
)

# The model is called exactly like one without embeddings.
y_pred = model_with_embeddings(num_tensor)

In [6]:
# Display the current value of y_pred (assigned from model_with_embeddings in
# the preceding cell); one output row per input row.
y_pred

tensor([[-0.0053],
        [ 0.0523],
        [-0.0140],
        ...,
        [-0.0168],
        [ 0.0151],
        [ 0.0563]], grad_fn=<AddmmBackward0>)

In [7]:

# (Q) Quantile-based bins, computed from the numeric training tensor.
# The displayed result is a list of tensors - presumably one tensor of bin
# edges per feature column (confirm against the compute_bins documentation).
bins = compute_bins(num_tensor)
bins

[tensor([17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30.,
         31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44.,
         45., 46., 47., 48., 49., 50., 51., 53., 54., 56., 58., 59., 62., 65.,
         69., 90.]),
 tensor([  12285.0000,   31053.0000,   36956.0000,   46699.0000,   55465.0000,
           67929.0078,   79627.0000,   89598.0000,   97939.0000,  103277.0000,
          108233.0000,  113151.0000,  117634.0000,  122353.0000,  127961.0547,
          134331.0000,  140581.0000,  145886.0312,  151580.0000,  156848.0000,
          161944.0000,  166416.0000,  170583.0000,  174685.0000,  177974.0000,
          182227.0000,  186035.0000,  189511.0000,  192932.0000,  196993.9688,
          200679.0000,  205175.0000,  210508.9844,  216208.0000,  222899.0000,
          229826.0000,  237735.0000,  246652.0000,  256416.9062,  267431.0000,
          279452.0000,  292472.0000,  308118.0000,  324791.0000,  342709.0000,
          362835.0000,  393829.

In [None]:

# Piecewise-linear encoding of the numeric training tensor, built from `bins`.
# NOTE(review): the original comment labelled this "QL/TL Linear(ple(x_i))",
# but no linear layer is applied in this cell - only the encoding itself.
# Confirm which variant was intended.
emb = PiecewiseLinearEncoding(bins)
new_rep = emb(num_tensor)



In [16]:
# Sum of (len(b) - 1) over the tensors in `bins`. The same expression is used
# as the MLP input width `d_in` when the piecewise-linear-encoding model is built.
sum(len(b) - 1 for b in bins)

130

In [12]:
# MLP-Q / MLP-T: an MLP fed with the piecewise-linear encoding of the inputs.
ple_width = sum(len(b) - 1 for b in bins)  # input width handed to the MLP
model_PLE = nn.Sequential(
    PiecewiseLinearEncoding(bins),
    nn.Flatten(),
    MLP(d_in=ple_width, **mlp_config),
)

# Note: this reassigns y_pred, replacing any value produced by an earlier model.
y_pred = model_PLE(num_tensor)

In [13]:
# Display the current value of y_pred (assigned from model_PLE in the
# preceding cell).
y_pred

tensor([[-0.0828],
        [ 0.0061],
        [-0.0819],
        ...,
        [-0.0871],
        [-0.0593],
        [ 0.0005]], grad_fn=<AddmmBackward0>)

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    """Minimal map-style dataset wrapping an indexable collection of samples.

    Args:
        data: Any object supporting ``len()`` and integer indexing, e.g. a
            tensor whose first dimension enumerates the samples.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]


# Example data: 100 samples with 3 features each.
data = torch.randn(100, 3)

# Wrap the tensor in the custom dataset.
dataset = CustomDataset(data)

# Build the DataLoader on top of `dataset` (not the raw tensor), so that
# CustomDataset.__len__ / __getitem__ are what actually serve the samples.
batch_size = 32
shuffle = True
# num_workers is passed to the DataLoader explicitly. 0 means batches are
# loaded in the main process, which is enough for a small in-memory tensor
# and avoids worker-process start-up problems inside notebooks.
num_workers = 0
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers,
)

# Iterate over the DataLoader.
for batch in dataloader:
    # `batch` holds one mini-batch of samples.
    print(batch.shape)  # show the size of the batch
    # Use the batch with your PyTorch model here: training step, predictions, etc.


torch.Size([32, 3])
torch.Size([32, 3])
torch.Size([32, 3])
torch.Size([4, 3])


In [4]:
import numpy as np

# Toy one-dimensional array holding three values.
X = np.array([0.004911, 0.500111, 0.893677])

# Add a trailing axis so the flat vector becomes a single-column 2-D array.
reshaped_X = X[:, np.newaxis]

print(reshaped_X.shape)


(3, 1)
