In [None]:
! pip install pytorch_tabular[all]

In [None]:
! pip install pytorch_tabular


In [None]:
! git clone https://github.com/manujosephv/pytorch_tabular


In [None]:
%cd pytorch_tabular


In [None]:
%pwd


In [None]:
!python setup.py install


In [None]:
!pip install setuptools==59.5.0


In [None]:
%cd ..

In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
# Detect Google Colab by probing for the `google.colab` package.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; any other error should
    # surface instead of being swallowed by a bare `except`.
    IN_COLAB = False
if not IN_COLAB:
    # NOTE: not idempotent -- each re-run of this cell moves one directory up.
    os.chdir("..")
%load_ext autoreload
%autoreload 2

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, FTTransformerConfig, FTTransformerModel, TabNetModelConfig, TabNetModel, AutoIntConfig, AutoIntConfig, TabTransformerConfig, TabTransformerModel
from pytorch_tabular.models import AutoIntModel, AutoIntConfig, NodeConfig, NODEModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig, ModelConfig
from pytorch_tabular.models import BaseModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from omegaconf import DictConfig
from typing import Dict
from dataclasses import dataclass, field


In [None]:
def make_mixed_regression(n_samples, n_features, n_categories):
    """Build a synthetic regression frame with numeric and categorical columns.

    ``make_regression`` generates the continuous features (fixed
    ``random_state=42``). ``n_categories`` distinct feature indices are then
    drawn with the global ``random`` state and each selected column is replaced
    by its quartile bin code from ``pd.qcut(..., q=4)``.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_categories: Number of feature columns to convert to categorical.

    Returns:
        Tuple ``(data, cat_col_names, num_col_names)``. ``data`` is a DataFrame
        with the feature columns (named ``cat_col_<i>`` / ``num_col_<i>`` by
        position) joined with a ``target`` column; the two lists hold the
        categorical and numeric column names in positional order.

    Raises:
        ValueError: If ``n_categories`` is negative or exceeds ``n_features``.
    """
    if not 0 <= n_categories <= n_features:
        raise ValueError(
            f"n_categories must be between 0 and n_features ({n_features}), got {n_categories}"
        )
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        random_state=42,
        n_informative=5,
        n_targets=1,
    )
    # sample() draws WITHOUT replacement. choices() could repeat an index and
    # silently produce fewer categorical columns than requested.
    cat_cols = set(random.sample(range(X.shape[-1]), k=n_categories))
    for col in sorted(cat_cols):
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(X.shape[-1]):
        if i in cat_cols:
            name = f"cat_col_{i}"
            cat_col_names.append(name)
        else:
            name = f"num_col_{i}"
            num_col_names.append(name)
        col_names.append(name)
    X = pd.DataFrame(X, columns=col_names)
    y = pd.DataFrame(y, columns=["target"])
    data = X.join(y)
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    """Print the MSE and MAE of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth values (DataFrame, Series, or array with ``ndim``).
        y_pred: Predicted values, same kinds as ``y_true``.
        tag: Label placed in front of each metric name in the printed line.
    """
    def _as_flat_array(values):
        # Unwrap pandas containers, then collapse anything with more than one axis.
        if isinstance(values, (pd.DataFrame, pd.Series)):
            values = values.values
        return values.ravel() if values.ndim > 1 else values

    actual = _as_flat_array(y_true)
    predicted = _as_flat_array(y_pred)
    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    print(f"{tag} MSE: {mse} | {tag} MAE: {mae}")

data, cat_col_names, num_col_names = make_mixed_regression(n_samples=10000, n_features=4, n_categories=2)
# Hold out the test set first, then carve the validation set out of the
# remaining training rows. Re-splitting `data` with the same seed would make
# df_valid identical to df_test, so the "holdout" would be used for early stopping.
df_train, df_test = train_test_split(data, random_state=42)
df_train, df_valid = train_test_split(df_train, random_state=42)

In [None]:
# FT - Transformer

epochs = 15
batch_size = 64
steps_per_epoch = int((len(df_train)//batch_size)*0.9)  # only used by the OneCycleLR alternative below
data_config = DataConfig(
    target=['target'],
    # Use the names returned by make_mixed_regression: which feature indices
    # become categorical is drawn at random, so hard-coded names may not exist.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
#         continuous_feature_transform="quantile_uniform"
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Runs the LRFinder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping_patience = 5,
    gpus=1 if torch.cuda.is_available() else None,  # one GPU when available; None means CPU
)
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

optimizer_config = OptimizerConfig(lr_scheduler="ReduceLROnPlateau", lr_scheduler_params={"patience":3})


model_config = FTTransformerConfig(
    task = "regression",
    learning_rate=1e-3,
    seed = 13,
    input_embed_dim = 32,
    num_heads = 8,
    num_attn_blocks = 6,
    ff_dropout = 0.1,
    out_ff_layers = "128-64-32",
    out_ff_activation = "LeakyReLU",
    out_ff_initialization="kaiming",
    batch_norm_continuous_input=False,
    #         target_range=[(df_train[col].min(),df_train[col].max()) for col in ['target']]
)



tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

In [None]:
# Train the configured model on df_train, passing df_valid as the validation frame.
tabular_model.fit(train=df_train, validation=df_valid)

In [None]:
# Prediction and visualization

In [None]:
# Predict on df_test and preview the first rows of the result.
# NOTE(review): `quantiles` / `n_samples` presumably only matter for probabilistic
# models -- confirm against the installed pytorch_tabular version.
pred_df = tabular_model.predict(df_test, quantiles=[0.15,0.5,0.85], n_samples=1000, ret_logits=False)
pred_df.head()

In [None]:
# `pred_df` already holds the predictions from the previous cell; score them
# instead of re-running the identical predict() call. The former mid-cell
# `pred_df.head()` produced no output and is dropped.
print_metrics(pred_df['target'], pred_df["target_prediction"], tag="Holdout")

In [None]:
# Actual (blue) vs. predicted (red) target against the first continuous feature.
# The name is taken from num_col_names because the generated column names depend
# on a random draw and a hard-coded name may not exist.
feature = num_col_names[0]
plt.scatter(pred_df[feature], pred_df['target'], color = 'blue', label = 'actual')
plt.scatter(pred_df[feature], pred_df['target_prediction'], color = 'red', label = 'predicted')
plt.xlabel(feature)
plt.ylabel('target')
plt.legend();

In [None]:
# Actual (blue) vs. predicted (red) target against the last continuous feature.
# The name is taken from num_col_names because the generated column names depend
# on a random draw and a hard-coded name may not exist.
feature = num_col_names[-1]
plt.scatter(pred_df[feature], pred_df['target'], color = 'blue', label = 'actual')
plt.scatter(pred_df[feature], pred_df['target_prediction'], color = 'red', label = 'predicted')
plt.xlabel(feature)
plt.ylabel('target')
plt.legend();

In [None]:
# Actual (blue) vs. predicted (red) target against the first categorical feature.
# The name is taken from cat_col_names because the generated column names depend
# on a random draw and a hard-coded name may not exist.
feature = cat_col_names[0]
plt.scatter(pred_df[feature], pred_df['target'], color = 'blue', label = 'actual')
plt.scatter(pred_df[feature], pred_df['target_prediction'], color = 'red', label = 'predicted')
plt.xlabel(feature)
plt.ylabel('target')
plt.legend();

In [None]:
# Actual (blue) vs. predicted (red) target against the last categorical feature.
# The name is taken from cat_col_names because the generated column names depend
# on a random draw and a hard-coded name may not exist.
feature = cat_col_names[-1]
plt.scatter(pred_df[feature], pred_df['target'], color = 'blue', label = 'actual')
plt.scatter(pred_df[feature], pred_df['target_prediction'], color = 'red', label = 'predicted')
plt.xlabel(feature)
plt.ylabel('target')
plt.legend();

In [None]:
import random
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode

In [None]:
# TabNet

epochs = 15
batch_size = 64
steps_per_epoch = int((len(df_train)//batch_size)*0.9)  # only used by the OneCycleLR alternative below
data_config = DataConfig(
    target=['target'],
    # Use the names returned by make_mixed_regression: which feature indices
    # become categorical is drawn at random, so hard-coded names may not exist.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
#         continuous_feature_transform="quantile_uniform"
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Runs the LRFinder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping_patience = 5,
    gpus=1 if torch.cuda.is_available() else None,  # one GPU when available; None means CPU
)
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

optimizer_config = OptimizerConfig(lr_scheduler="ReduceLROnPlateau", lr_scheduler_params={"patience":3})


model_config = TabNetModelConfig(
    task = "regression"
)



tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Train on df_train, passing df_valid as the validation frame.
tabular_model.fit(train=df_train, validation=df_valid)

In [None]:
# Predict on df_test, preview the result, then report MSE / MAE.
pred_df = tabular_model.predict(df_test, quantiles=[0.25,0.5,0.75], n_samples=100, ret_logits=False)
# A bare `pred_df.head()` in the middle of a cell is silently discarded;
# display() renders the preview explicitly.
display(pred_df.head())

print_metrics(pred_df['target'], pred_df["target_prediction"], tag="Holdout")

In [None]:
# Node

epochs = 15
batch_size = 64
steps_per_epoch = int((len(df_train)//batch_size)*0.9)  # only used by the OneCycleLR alternative below
data_config = DataConfig(
    target=['target'],
    # Use the names returned by make_mixed_regression: which feature indices
    # become categorical is drawn at random, so hard-coded names may not exist.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
#         continuous_feature_transform="quantile_uniform"
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Runs the LRFinder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping_patience = 5,
    gpus=1 if torch.cuda.is_available() else None,  # one GPU when available; None means CPU
)
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

optimizer_config = OptimizerConfig(lr_scheduler="ReduceLROnPlateau", lr_scheduler_params={"patience":3})


model_config = NodeConfig(
    task = "regression"
)



tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Train on df_train, passing df_valid as the validation frame.
tabular_model.fit(train=df_train, validation=df_valid)

In [None]:
# Predict on df_test and preview the first rows of the result.
# NOTE(review): `quantiles` / `n_samples` presumably only matter for probabilistic
# models -- confirm against the installed pytorch_tabular version.
pred_df = tabular_model.predict(df_test, quantiles=[0.25,0.5,0.75], n_samples=100, ret_logits=False)
pred_df.head()



In [None]:
# Print MSE and MAE of the predicted target against the true target in pred_df.
print_metrics(pred_df['target'], pred_df["target_prediction"], tag="Holdout")

In [None]:
# TabTransformer


epochs = 15
batch_size = 64
steps_per_epoch = int((len(df_train)//batch_size)*0.9)  # only used by the OneCycleLR alternative below
data_config = DataConfig(
    target=['target'],
    # Use the names returned by make_mixed_regression: which feature indices
    # become categorical is drawn at random, so hard-coded names may not exist.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
#         continuous_feature_transform="quantile_uniform"
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Runs the LRFinder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping_patience = 5,
    gpus=1 if torch.cuda.is_available() else None,  # one GPU when available; None means CPU
)
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

optimizer_config = OptimizerConfig(lr_scheduler="ReduceLROnPlateau", lr_scheduler_params={"patience":3})


model_config = TabTransformerConfig(
    task = "regression"
)



tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Train on df_train, passing df_valid as the validation frame.
tabular_model.fit(train=df_train, validation=df_valid)



In [None]:
# Predict on df_test and preview the first rows of the result.
# NOTE(review): `quantiles` / `n_samples` presumably only matter for probabilistic
# models -- confirm against the installed pytorch_tabular version.
pred_df = tabular_model.predict(df_test, quantiles=[0.25,0.5,0.75], n_samples=100, ret_logits=False)
pred_df.head()



In [None]:
# Print MSE and MAE of the predicted target against the true target in pred_df.
print_metrics(pred_df['target'], pred_df["target_prediction"], tag="Holdout")

In [None]:
# AutoInt

epochs = 15
batch_size = 64
steps_per_epoch = int((len(df_train)//batch_size)*0.9)  # only used by the OneCycleLR alternative below
data_config = DataConfig(
    target=['target'],
    # Use the names returned by make_mixed_regression: which feature indices
    # become categorical is drawn at random, so hard-coded names may not exist.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
#         continuous_feature_transform="quantile_uniform"
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Runs the LRFinder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping_patience = 5,
    gpus=1 if torch.cuda.is_available() else None,  # one GPU when available; None means CPU
)
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

optimizer_config = OptimizerConfig(lr_scheduler="ReduceLROnPlateau", lr_scheduler_params={"patience":3})


model_config = AutoIntConfig(
    task = "regression"
)



tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Train on df_train, passing df_valid as the validation frame.
tabular_model.fit(train=df_train, validation=df_valid)



In [None]:
# Predict on df_test and preview the first rows of the result.
# NOTE(review): `quantiles` / `n_samples` presumably only matter for probabilistic
# models -- confirm against the installed pytorch_tabular version.
pred_df = tabular_model.predict(df_test, quantiles=[0.25,0.5,0.75], n_samples=100, ret_logits=False)
pred_df.head()



In [None]:
# Print MSE and MAE of the predicted target against the true target in pred_df.
print_metrics(pred_df['target'], pred_df["target_prediction"], tag="Holdout")

In [None]:
def uncertainity_estimate(x, model, num_samples, l2, n_train=None):
    """Estimate a predictive mean and standard deviation from repeated forward passes.

    ``model(x)`` is evaluated ``num_samples`` times and the outputs are stacked
    side by side. The per-row mean and variance are taken across the passes, and
    ``1 / tau`` is added to the variance, where
    ``tau = l2 * (1 - model.dropout_rate) / (2 * n_train * model.decay)``.

    NOTE(review): this looks like the MC-dropout estimate, which only varies
    between passes if dropout is active in ``model`` -- confirm with the caller.

    Args:
        x: Input passed unchanged to ``model``.
        model: Callable whose output supports ``.cpu().detach().numpy()`` and
            which exposes ``dropout_rate`` and ``decay`` attributes.
        num_samples: Number of forward passes.
        l2: Numerator scale in the ``tau`` formula (presumably a squared prior
            length-scale -- TODO confirm).
        n_train: The ``N`` term of the ``tau`` formula (presumably the number of
            training samples -- TODO confirm). When omitted, a global ``N`` is
            used so existing notebook code keeps working.

    Returns:
        Tuple ``(y_mean, y_std)`` of NumPy arrays, one entry per output row.

    Raises:
        ValueError: If ``n_train`` is not given and no global ``N`` is defined.
    """
    if n_train is None:
        # Backward-compatible fallback: the original body read a global `N`.
        n_train = globals().get("N")
        if n_train is None:
            raise ValueError("n_train must be provided (or a global `N` must be defined)")
    # Run inference num_samples times (original note: output.shape = [20, N]).
    outputs = np.hstack([model(x).cpu().detach().numpy() for _ in range(num_samples)])
    y_mean = outputs.mean(axis=1)
    y_variance = outputs.var(axis=1)
    tau = l2 * (1. - model.dropout_rate) / (2. * n_train * model.decay)
    y_variance += (1. / tau)
    y_std = np.sqrt(y_variance)
    return y_mean, y_std