In [9]:
import pandas as pd
import numpy as np

from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [17]:
data = pd.read_csv('./data/data.csv').drop(['Unnamed: 0'], axis=1)
data['is_test'] = 0
test_loc = np.random.choice(data.index, size=10_000, replace=False)
data.loc[test_loc, 'is_test'] = 1

In [18]:
numerical_features = [f'numeric_{i}' for i in range(10)]
categorical_features = [f'categ_{i}' for i in range(2)]
features = numerical_features + categorical_features

In [19]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ])

#we add this only to add something in the final column transformer for the categ feats
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ])

preprocessor = ColumnTransformer(       
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)]
)

preprocessor.fit(data.query('is_test == 0')[features])

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['numeric_0', 'numeric_1', 'numeric_2',
                                  'numeric_3', 'numeric_4', 'numeric_5',
                                  'numeric_6', 'numeric_7', 'numeric_8',
                                  'numeric_9']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value=-1,
                                                                strategy='constant'))]),
                                 ['categ_0', 'categ_1'])])

In [20]:
transf_feats = pd.DataFrame(
    preprocessor.transform(data[features]), 
    columns=features
)

transf_feats.head()

Unnamed: 0,numeric_0,numeric_1,numeric_2,numeric_3,numeric_4,numeric_5,numeric_6,numeric_7,numeric_8,numeric_9,categ_0,categ_1
0,-0.14252,0.244563,0.247198,0.543519,-0.305092,-0.407049,-0.121061,-0.459183,0.676633,1.105332,0.0,0.0
1,-0.295731,1.167725,0.02878,-0.466646,-0.312196,-0.643645,-0.536148,-0.45915,0.629719,-0.571474,0.0,2.0
2,-0.358738,-0.627757,0.787211,-0.245488,0.313643,-0.849559,0.643323,3.55923,0.259474,1.32973,0.0,4.0
3,4.144482,0.174749,0.650405,-0.030877,-0.013285,-0.582646,1.007167,-0.459119,0.675755,-0.76656,0.0,1.0
4,1.16804,-0.513689,0.05638,0.483031,-0.311699,0.560803,0.86814,-0.430382,0.060544,-1.507315,1.0,0.0


In [21]:
def get_emb_size_tio_jeremy(n_cat):
    return min(10, round(1.6 * n_cat**0.56))

In [25]:
cat_dims = [data[feature].nunique() for i, feature in enumerate(categorical_features)]
cat_idxs = [i for i, f in enumerate(features) if f in categorical_features]
cat_emb_dim = [get_emb_size_tio_jeremy(n_cat) for n_cat in cat_dims]

In [26]:
regressor = TabNetRegressor(
    cat_dims=cat_dims, 
    cat_emb_dim=cat_emb_dim, 
    cat_idxs=cat_idxs
)

Device used : cpu


In [27]:
data['target']

0         8.934105
1         5.497389
2         6.856786
3         7.036078
4        -6.328707
           ...    
99995    24.311125
99996    -7.669898
99997   -11.436065
99998   -10.926239
99999    -1.751841
Name: target, Length: 100000, dtype: float64

In [29]:
X_train, X_test = data.query('is_test == 0')[features], data.query('is_test == 1')[features]
y_train, y_test = data.query('is_test == 0')[["target"]], data.query('is_test == 1')[["target"]]


In [30]:
max_epochs = 70
regressor.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'test'],
    eval_metric=['mae'],
    max_epochs=max_epochs,
    patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', regressor)])

KeyError: 69934