# Pytorch Tabular

In [1]:
import numpy as np
import pandas as pd

# Pytorch Tabular
from pytorch_tabular import TabularModel
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)

## Preparar Dataset

### Create Dataset

In [2]:
"""
from pytorch_tabular.utils import make_mixed_dataset
df, cat_col_names, num_col_names = make_mixed_dataset(
    task="classification", n_samples=3000, n_features=7, n_categories=4
)
date_col_names = []
target_col = ['target']
"""

'\nfrom pytorch_tabular.utils import make_mixed_dataset\ndf, cat_col_names, num_col_names = make_mixed_dataset(\n    task="classification", n_samples=3000, n_features=7, n_categories=4\n)\ndate_col_names = []\ntarget_col = [\'target\']\n'

## Insurance Lead Prediction Raw Data Dataset

https://www.kaggle.com/datasets/owaiskhan9654/health-insurance-lead-prediction-raw-data

In [3]:
"""
df = pd.read_csv('dataset/Health Insurance Lead Prediction Raw Data.csv')

df = df.drop('ID', axis=1)
df = df.dropna()

df['Holding_Policy_Duration'].replace('14+', 14.0, inplace=True)

num_col_names = ['Region_Code', 'Upper_Age', 'Lower_Age', 'Holding_Policy_Duration', 'Reco_Policy_Premium',]
cat_col_names = ['City_Code', 'Accomodation_Type', 'Reco_Insurance_Type', 'Is_Spouse', 'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat']
date_col_names = []
target_col = ['Response']
df
"""

"\ndf = pd.read_csv('dataset/Health Insurance Lead Prediction Raw Data.csv')\n\ndf = df.drop('ID', axis=1)\ndf = df.dropna()\n\ndf['Holding_Policy_Duration'].replace('14+', 14.0, inplace=True)\n\nnum_col_names = ['Region_Code', 'Upper_Age', 'Lower_Age', 'Holding_Policy_Duration', 'Reco_Policy_Premium',]\ncat_col_names = ['City_Code', 'Accomodation_Type', 'Reco_Insurance_Type', 'Is_Spouse', 'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat']\ndate_col_names = []\ntarget_col = ['Response']\ndf\n"

### Water Quality and Potability Dataset

https://www.kaggle.com/datasets/adityakadiwal/water-potability

In [4]:
# Load the Water Quality and Potability data from the local CSV file.
df = pd.read_csv('dataset/water_potability.csv')

# Column roles used later by DataConfig: the last column is the target and every
# other column is treated as a continuous feature; there are no categorical or
# date columns in this dataset.
target_col = ['Potability']
num_col_names = df.columns[:-1].tolist()
cat_col_names = []
date_col_names = []

df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [5]:
# Drop every row that contains a missing value; `df` is rebound to the filtered frame.
df = df.dropna()
# Class balance of the target column after the incomplete rows were removed.
df[target_col].value_counts()

Potability
0             1200
1              811
Name: count, dtype: int64

### Dataset Splitting

In [6]:
from sklearn.model_selection import train_test_split

# Two-step split with a fixed random_state so the partition is repeatable:
#   1) keep 70% of the rows for training and hold out the remaining 30%;
#   2) cut the hold-out in half -> roughly 15% validation and 15% test.
# `test` is used as a temporary name for the 30% hold-out before being rebound
# to the final test split on the second line.
train, test = train_test_split(df, random_state=42, test_size=0.3)
valid, test = train_test_split(test, random_state=42, test_size=0.5)

## Train Pytorch Tabular
https://github.com/manujosephv/pytorch_tabular

### Preparar Configurações

No [DataConfig](https://pytorch-tabular.readthedocs.io/en/latest/data/#pytorch_tabular.config.DataConfig) estão as configurações do dataset: a coluna do DataFrame Pandas que é o "target" (valor que será previsto), as colunas que possuem dados contínuos e as que possuem dados categóricos, além de outras configurações que podem ser alteradas.

In [7]:
# Dataset description shared by every training call in this notebook
# (TabularModel, model_sweep and TabularModelTuner all receive this object).
data_config = DataConfig(
    # The target must always be a list. Multi-target is only supported for regression;
    # multi-task classification is not implemented.
    target=target_col,
    continuous_cols=num_col_names,  # Column names of the numeric fields
    categorical_cols=cat_col_names,  # Column names of the categorical fields
    # (column name, freq, format) tuples of the date fields, e.g. [("date", "M", "%Y-%m")].
    # freq aliases: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    date_columns=date_col_names,
    normalize_continuous_features=True, # Normalize the continuous input features (library default is True)
    # NOTE(review): presumably the number of data-loading worker processes — confirm in the DataConfig docs.
    num_workers=4
)

No [TrainerConfig](https://pytorch-tabular.readthedocs.io/en/latest/training/#using-the-entire-pytorch-lightning-trainer) estão as configurações de treinamento, incluindo quais métricas iremos utilizar para avaliar o modelo e criar os checkpoints

No [OptimizerConfig](https://pytorch-tabular.readthedocs.io/en/latest/optimizer/#pytorch_tabular.config.OptimizerConfig) estão as configurações do otimizador que será utilizado, sendo inclusive possível alterar seus parâmetros.

In [8]:
# Training-loop settings: learning-rate finder, batch size, epoch limits,
# early stopping and checkpointing (both monitor validation accuracy).
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Run the LR finder to derive the learning rate automatically (default False)
    batch_size=64,
    max_epochs=300,
    min_epochs=3,
    early_stopping='valid_accuracy', # Metric monitored for early stopping: valid_loss or valid_accuracy
    early_stopping_mode = 'max', # Use min for valid_loss (lower is better) and max for valid_accuracy (higher is better)
    early_stopping_patience=10, # Number of epochs without improvement to wait before stopping
    checkpoints='valid_accuracy', # Metric monitored to save the best checkpoint
    checkpoints_mode='max',  # Use min for valid_loss (lower is better) and max for valid_accuracy (higher is better)
)

# Every option below is commented out, so OptimizerConfig is built with its defaults.
# Uncomment a line to choose another optimizer or to add a learning-rate scheduler.
optimizer_config = OptimizerConfig(
    #optimizer='RMSprop',  # https://pytorch.org/docs/stable/optim.html#algorithms
    #optimizer_params={'alpha': 0.99},
    #lr_scheduler="StepLR",
    #lr_scheduler_params={"step_size": 10},
)

### Train Specific Architecture

No [ModelConfig](https://pytorch-tabular.readthedocs.io/en/latest/apidocs_model/#pytorch_tabular.config.ModelConfig) estão as configurações da arquitetura que será utilizada. Os parâmetros variam de acordo com cada arquitetura, sendo necessário consultar a documentação de cada uma. Vale ressaltar que todas as configurações de arquitetura herdam da classe "ModelConfig".

O único parâmetro obrigatório é a "task", que informa se a saída será de "classificação", "regressão" ou "backbone" (retorna os embeddings). Já as arquiteturas possuem parâmetros (hiperparâmetros) pré-configurados, que não precisam ser alterados (mas alterá-los é recomendado para buscar melhores resultados; para isso, recomenda-se utilizar o Tuner, apresentado mais adiante neste Notebook).

In [9]:
from pytorch_tabular import available_models
# List the names of the model config classes provided by the installed pytorch_tabular.
available_models()

['AutoIntConfig',
 'CategoryEmbeddingModelConfig',
 'DANetConfig',
 'FTTransformerConfig',
 'GANDALFConfig',
 'GatedAdditiveTreeEnsembleConfig',
 'MDNConfig',
 'NodeConfig',
 'TabNetModelConfig',
 'TabTransformerConfig']

Neste caso iremos utilizar a arquitetura [Category Embedding](https://pytorch-tabular.readthedocs.io/en/latest/apidocs_model/#pytorch_tabular.models.CategoryEmbeddingModelConfig).

In [10]:
from pytorch_tabular.models import CategoryEmbeddingModelConfig

In [11]:
# Architecture settings for the Category Embedding model. Only `task` is set out of
# necessity; `layers` and `activation` are spelled out with their documented defaults.
# The commented-out lines show other options that can be enabled.
model_config_category_embedding = CategoryEmbeddingModelConfig(
    task='classification',  # One of "classification", "regression" or "backbone"
    layers='128-64-32',  # Hyphen-separated number of nodes in each layer. Defaults to 128-64-32
    activation='ReLU',  # Activation between layers; any default PyTorch activation (ReLU, TanH, LeakyReLU, ...). Defaults to ReLU
    #learning_rate=1e-3,
    #loss='CrossEntropyLoss',  # Any loss function from torch.nn can be used here. Defaults to MSELoss for regression and CrossEntropyLoss for classification
    #metrics=['accuracy'],  # Metrics such as "accuracy" and "f1_score". Defaults to accuracy for classification and mean_squared_error for regression - https://lightning.ai/docs/torchmetrics/stable/all-metrics.html (use the function name from the "Functional Interface")
    #target_range=[], # e.g. [(10.5, 30.1)] limits the output to between 10.5 and 30.1. Typically used for regression; currently ignored for multi-target regression. If left empty, no restriction is applied
)

No [TabularModel](https://pytorch-tabular.readthedocs.io/en/latest/tabular_model/) ficam reunidas todas as configurações realizadas, e é nele que está definitivamente o nosso modelo. É ele que iremos usar para treinar e depois para fazer inferências.

In [12]:
# Bundle the data, model, optimizer and trainer configurations into one model object;
# this is the object used below for training, evaluation and inference.
best_model = TabularModel(
    data_config=data_config,
    model_config=model_config_category_embedding,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False  # Turn off the verbose to avoid printing logs from different stages
)

In [13]:
# Train on the training split and pass the validation split separately; trainer_config
# monitors valid_accuracy for early stopping and checkpointing.
best_model.fit(train=train, validation=valid)

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/max/anaconda3/envs/tabular/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/max/Documents/PA/pytorch_tabular/machine_learning/saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/max/anaconda3/envs/tabular/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (22) is smaller than the logging interval Trainer(log_every_n_s

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.0003019951720402019
Restoring states from the checkpoint path at /home/max/Documents/PA/pytorch_tabular/machine_learning/.lr_find_1fde7199-0b15-4823-84b7-c40220766f05.ckpt
Restored all states from the checkpoint at /home/max/Documents/PA/pytorch_tabular/machine_learning/.lr_find_1fde7199-0b15-4823-84b7-c40220766f05.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x7cde188a46a0>

In [14]:
# Report loss and accuracy on the held-out test split, which was not passed to fit().
best_model.evaluate(test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'test_loss_0': 0.5982664227485657,
  'test_loss': 0.5982664227485657,
  'test_accuracy': 0.6490066051483154}]

In [15]:
# Save best model
#best_model.save_model('results/best_model', inference_only=True)

## Train Many Models
Entre tantas alternativas é difícil saber qual a melhor arquitetura utilizar, sem falar que muitas vezes uma arquitetura tem um desempenho muito bom em determinado assunto/dataset, mas isso não se mostra verdade em outro cenário, dificultando a escolha de qual utilizar.

Para ajudar nesse dilema o Pytorch Tabular possui a função [model_sweep](https://pytorch-tabular.readthedocs.io/en/latest/apidocs_coreclasses/#pytorch_tabular.model_sweep), que roda todas as arquiteturas com os hiperparâmetros padrões e cujo resultado consiste no comparativo entre todas elas. Dessa forma sabemos em qual/quais arquiteturas devemos investir mais tempo para buscar melhorar seus hiperparâmetros.

Vamos rodar a categoria "high_memory", que irá treinar todos os modelos, inclusive os que utilizam bastante memória. Mas não se preocupe: se a memória não for suficiente, a biblioteca irá ignorar o modelo e a tabela de resultados irá informar que não havia memória suficiente (OOM). Vale ressaltar que pode ser interessante usar um batch_size pequeno para conseguir treinar todos os modelos sem resultar em OOM, ou rodar depois, isoladamente, o modelo específico que deu OOM com um batch_size que seja suficiente :).

In [16]:
from pytorch_tabular import model_sweep, tabular_model_sweep

In [17]:
# Show the built-in presets (preset name -> list of model configs) accepted by
# model_sweep's `model_list` argument.
tabular_model_sweep.MODEL_SWEEP_PRESETS

{'lite': [('CategoryEmbeddingModelConfig', {'layers': '256-128-64'}),
  ('GANDALFConfig', {'gflu_stages': 6}),
  ('TabNetModelConfig',
   {'n_d': 32,
    'n_a': 32,
    'n_steps': 3,
    'gamma': 1.5,
    'n_independent': 1,
    'n_shared': 2})],
 'standard': [('CategoryEmbeddingModelConfig', {'layers': '256-128-64'}),
  ('CategoryEmbeddingModelConfig', {'layers': '512-128-64'}),
  ('GANDALFConfig', {'gflu_stages': 6}),
  ('GANDALFConfig', {'gflu_stages': 15}),
  ('TabNetModelConfig',
   {'n_d': 32,
    'n_a': 32,
    'n_steps': 3,
    'gamma': 1.5,
    'n_independent': 1,
    'n_shared': 2}),
  ('TabNetModelConfig',
   {'n_d': 32,
    'n_a': 32,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 3}),
  ('FTTransformerConfig', {'num_heads': 4, 'num_attn_blocks': 4})],
 'full': ['AutoIntConfig',
  'CategoryEmbeddingModelConfig',
  'DANetConfig',
  'FTTransformerConfig',
  'GANDALFConfig',
  'GatedAdditiveTreeEnsembleConfig',
  'TabNetModelConfig',
  'TabTransfor

In [18]:
# Train every model of the chosen preset with the shared data/optimizer/trainer configs
# and compare them on the validation split (passed as `test` so the real test split
# stays untouched). Returns the comparison table and the best model.
# NOTE: this rebinds `best_model`, replacing the Category Embedding model trained above.
sweep_df, best_model = model_sweep(
    task='classification',  # 'classification' or 'regression'
    train=train,
    test=valid,
    data_config=data_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    model_list='high_memory',  # Models to compare: one of the presets in pytorch_tabular.tabular_model_sweep.MODEL_SWEEP_PRESETS or a list of ModelConfig objects. Defaults to "lite".
    verbose=False
)

Output()



Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

O melhor checkpoint de todos os treinamentos será armazenado na variável de output, que neste caso estamos chamando de "best_model".

In [19]:
# Evaluate the best model returned by the sweep on the held-out test split.
best_model.evaluate(test)

[{'test_loss_0': 0.5931517481803894,
  'test_loss': 0.5931517481803894,
  'test_accuracy': 0.695364236831665}]

Lembrando que: Cada vez que rodar o treinamento o resultado pode ser diferente!

In [20]:
# Build a display-only copy of the sweep results instead of mutating `sweep_df` in
# place, so the raw results stay available and re-running this cell is side-effect free.
#   - "OOM" markers and NaN cells become 0.0
#   - infinite values become 999.0 so the colour gradient can be computed
# NOTE: wherever the sweep wrote "OOM", the cell is shown as 0.0 — keep that in mind
# when reading the loss column, where a lower value is coloured as better.
sweep_view = (
    sweep_df
    .replace(["OOM", np.nan], 0.0)
    .replace([np.inf], 999.)
    .drop(columns=['params', 'time_taken'])
    .sort_values('test_accuracy', ascending=False)
)

# Green = better: high accuracy, low loss and low time per epoch (reversed colormap).
(
    sweep_view.style
    .background_gradient(subset=['test_accuracy'], cmap="RdYlGn")
    .background_gradient(subset=['time_taken_per_epoch', 'test_loss'], cmap='RdYlGn_r')
)

Unnamed: 0,model,# Params,epochs,test_loss_0,test_loss,test_accuracy,time_taken_per_epoch
1,CategoryEmbeddingModel,11 T,21,0.605351,0.605351,0.708609,0.341039
0,AutoIntModel,14 T,14,0.614532,0.614532,0.701987,0.434277
7,TabNetModel,6 T,29,0.66123,0.66123,0.68543,0.636098
6,NODEModel,790 T,14,0.588302,0.588302,0.675497,1.580799
4,GANDALFModel,3 T,15,0.6892,0.6892,0.622517,0.58272
5,GatedAdditiveTreeEnsembleModel,69 T,10,0.678961,0.678961,0.586093,3.197777
3,FTTransformerModel,272 T,11,0.687022,0.687022,0.582781,0.734422
2,DANetModel,398 T,11,73.512764,73.512764,0.543046,1.412278
8,TabTransformerModel,271 T,11,0.782108,0.782108,0.503311,0.520263


In [21]:
# Save best model
#best_model.save_model('results/best_model', inference_only=True)

### Fine-tuning Hyperparameters
Show, agora que sabemos as melhores arquiteturas para o nosso modelo, podemos brincar com ela(s) para buscar melhores hiperparâmetros e obter resultados ainda melhores. Para isso o Pytorch Tabular disponibiliza a função [tuner](https://pytorch-tabular.readthedocs.io/en/latest/apidocs_coreclasses/#pytorch_tabular.TabularModelTuner.tune).

Configurações dos modelos: https://pytorch-tabular.readthedocs.io/en/latest/apidocs_model/

Mais informações de como o hyperparameter space funciona: https://pytorch-tabular.readthedocs.io/en/latest/tutorials/10-Hyperparameter%20Tuning/#define-the-hyperparameter-space

Neste caso vamos pegar as duas arquiteturas que se saíram melhor e brincar com elas.

In [22]:
from pytorch_tabular.tabular_model_tuner import TabularModelTuner

from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig,
    AutoIntConfig
)   

As chaves dos search spaces sempre começam com o nome da configuração, seguido de dois underlines e do nome do parâmetro (por exemplo, "model_config__layers"). Seus valores são listas com todas as opções que se deseja testar.

In [23]:
# Base Category Embedding configuration for the tuner; only the task is fixed here and
# the hyperparameters listed in the search space below are varied per trial.
# NOTE: this rebinds `model_config_category_embedding` defined earlier in the notebook.
model_config_category_embedding = CategoryEmbeddingModelConfig(task="classification")

# Keys follow "<config name>__<parameter>"; each value lists the options to try.
# Grid size for this model: 2 x 4 x 2 x 2 = 32 combinations.
search_space_category_embedding = {
    "optimizer_config__optimizer": ["Adam", "SGD"],
    "model_config__layers": ["128-64-32", "1024-512-256", "32-64-128", "256-512-1024"],
    "model_config__activation": ["ReLU", "LeakyReLU"],
    "model_config__embedding_dropout": [0.0, 0.2],
}

In [24]:
# Base AutoInt configuration for the tuner; only the task is fixed here and the
# hyperparameters listed in the search space below are varied per trial.
model_config_auto_int = AutoIntConfig(task="classification")

# Keys follow "<config name>__<parameter>"; each value lists the options to try.
# Grid size for this model: 2 x 2 x 2 x 2 x 2 = 32 combinations.
# NOTE(review): in AutoIntConfig, "layers" and "activation" appear to configure the
# optional deep MLP, which is only used when deep_layers=True — confirm against the
# AutoIntConfig documentation, otherwise these two axes may not change the model.
search_space_auto_int = {
    "optimizer_config__optimizer": ["Adam", "SGD"],
    "model_config__layers": ["128-64-32", "1024-512-256"],
    # attn_embed_dim is an integer hyperparameter: pass ints, not numeric strings,
    # so the config does not depend on implicit string-to-int coercion.
    "model_config__attn_embed_dim": [32, 64],
    "model_config__activation": ["ReLU", "LeakyReLU"],
    "model_config__embedding_dropout": [0.0, 0.2],
}

Para informar ao tuner quais arquiteturas e hiperparâmetros testar, é necessário adicioná-los em listas. Vale ressaltar que ambas as listas devem estar na mesma ordem e ter o mesmo tamanho (o primeiro elemento de models será usado com o primeiro de search_space, o segundo com o segundo, ...).

In [25]:
# Parallel lists: the i-th model config is tuned with the i-th search space,
# so both lists must have the same length and the same order.
all_models = [model_config_category_embedding, model_config_auto_int]
all_search_space = [search_space_category_embedding, search_space_auto_int]

O tuner possui duas "strategy" principais:
- grid_search: testa todas as combinações dos hiperparâmetros que foram definidos, mas lembre-se de que cada novo campo adicionado aumenta consideravelmente o tempo total de treinamento. Se você configurar 4 otimizadores, 4 layers, 2 ativações e 2 dropouts, isso significa 64 (4 * 4 * 2 * 2) treinamentos.
- random_search: sorteia aleatoriamente "n_trials" configurações de hiperparâmetros de cada modelo que foi definido. É útil para um treinamento mais rápido, mas lembre-se de que não testará todas as combinações de hiperparâmetros.

In [26]:
# The tuner reuses the shared data/optimizer/trainer configs and receives the list of
# base model configs to tune.
tuner = TabularModelTuner(
    data_config=data_config,
    model_config=all_models,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Run the search. Despite its name, `tuner_df` is not a DataFrame: it is the result
# object whose `.trials_df` and `.best_model` attributes are used in the cells below.
tuner_df = tuner.tune(
    train=train,
    validation=valid,
    search_space=all_search_space,
    strategy="grid_search",  # grid_search (try every combination in the search spaces) or random_search (try n_trials random combinations)
    n_trials=4,  # Only used when strategy is random_search
    metric="accuracy",
    mode="max",  # max or min, i.e. whether a higher or a lower metric value is better
    progress_bar=True,
    verbose=True # Set to True to log metrics and params at each iteration
)

Output()

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

O Tuner retorna uma única variável contendo [tabela de resultados, parâmetros do melhor modelo, valor da melhor métrica, melhor modelo].

In [27]:
# Rank all trials by validation accuracy (best first) and colour the table:
# green marks high accuracy and, via the reversed colormap, low loss.
tuner_df.trials_df.sort_values("accuracy", ascending=False).style.background_gradient(
    subset=["accuracy"], cmap="RdYlGn"
).background_gradient(subset=["loss"], cmap="RdYlGn_r")

Unnamed: 0,trial_id,model,model_config__activation,model_config__embedding_dropout,model_config__layers,optimizer_config__optimizer,loss_0,loss,accuracy,model_config__attn_embed_dim
22,22,0-CategoryEmbeddingModelConfig,LeakyReLU,0.0,256-512-1024,Adam,0.683077,0.683077,0.728477,
32,0,1-AutoIntConfig,ReLU,0.0,128-64-32,Adam,0.607369,0.607369,0.721854,32.0
23,23,0-CategoryEmbeddingModelConfig,LeakyReLU,0.0,256-512-1024,SGD,0.598071,0.598071,0.721854,
1,1,0-CategoryEmbeddingModelConfig,ReLU,0.0,128-64-32,SGD,0.640602,0.640602,0.721854,
55,23,1-AutoIntConfig,LeakyReLU,0.2,1024-512-256,SGD,0.610837,0.610837,0.715232,32.0
31,31,0-CategoryEmbeddingModelConfig,LeakyReLU,0.2,256-512-1024,SGD,0.596345,0.596345,0.715232,
28,28,0-CategoryEmbeddingModelConfig,LeakyReLU,0.2,32-64-128,Adam,0.602533,0.602533,0.715232,
20,20,0-CategoryEmbeddingModelConfig,LeakyReLU,0.0,32-64-128,Adam,0.593532,0.593532,0.715232,
30,30,0-CategoryEmbeddingModelConfig,LeakyReLU,0.2,256-512-1024,Adam,0.584957,0.584957,0.711921,
57,25,1-AutoIntConfig,LeakyReLU,0.0,128-64-32,SGD,0.619494,0.619494,0.711921,64.0


In [28]:
# Evaluate the best model found by the tuner on the held-out test split.
tuner_df.best_model.evaluate(test)

[{'test_loss_0': 0.6975728869438171,
  'test_loss': 0.6975728869438171,
  'test_accuracy': 0.6688741445541382}]

### Save Model

In [29]:
# Persist the tuned model under results/best_model; it is loaded back from the same
# path in the Inference section. inference_only=True requests an inference-only save.
tuner_df.best_model.save_model('results/best_model', inference_only=True)

## Inference

### Load Model

In [30]:
# TabularModel is already imported at the top of the notebook; the import is repeated
# here so the Inference section can be run on its own.
from pytorch_tabular import TabularModel
# Load the model saved under results/best_model in the "Save Model" step.
loaded_model = TabularModel.load_model('results/best_model')



### Predict
Os dados precisam estar no formato DataFrame do Pandas, com os nomes das colunas que participaram do treinamento, para serem passados ao modelo na inferência.

In [31]:
# One sample row with data from the Water Quality and Potability dataset: nine feature
# values in the column order ph, Hardness, Solids, Chloramines, Sulfate, Conductivity,
# Organic_carbon, Trihalomethanes, Turbidity (no target value).
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the
# kernel session; consider renaming it together with its use in the next cell.
input = [
    [5.949519, 160.442631, 16898.808297, 6.045906, 367.328542, 451.012788, 16.359951, 62.368234, 4.072198]
]

In [32]:
# Wrap the raw rows in a DataFrame labelled with the feature columns of the Water
# Quality and Potability dataset (every column except the Potability target).
input_df = pd.DataFrame(input, columns=['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity'])

In [33]:
# Display the inference input to check values and column names before predicting.
input_df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,5.949519,160.442631,16898.808297,6.045906,367.328542,451.012788,16.359951,62.368234,4.072198


In [34]:
# Run inference with the reloaded model; the result holds one probability column per
# class plus the predicted class for each input row.
loaded_model.predict(input_df)

Unnamed: 0,Potability_0_probability,Potability_1_probability,Potability_prediction
0,0.605089,0.394911,0
