In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
np.random.seed(0)

import os, sys

# Directory that contains the project-local `column` module; it is appended to
# sys.path (once) so that the `import column` further down can be resolved.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable project root.
column_path = os.path.dirname(os.path.realpath('C:/Users/sgopalakrish/Downloads/intellizenz-model-training/Neuro-symbolic-AI/column.py'))
if column_path not in sys.path:
    sys.path.append(column_path)

from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.augmentations import ClassificationSMOTE

import column
import wandb
from matplotlib import pyplot as plt
%matplotlib inline

## ===============================================

## 1. Load data and split

In [None]:
target = "veranst_segment"
# Active feature list; column.features_v5 was the earlier alternative.
feature_columns = column.features_v6

data_path = column.data_path_2016_2020_v3
df = pd.read_parquet(data_path)

# Optional class-frequency-weighted down-sampling used in earlier experiments:
# class_frequency = df.groupby('veranst_segment')['veranst_segment'].transform('count')
# df_sampled = df.sample(n=70000, weights=class_frequency, random_state=2)
# df_sampled = df_sampled[feature_columns]

# Restrict to the feature columns, leave out the two excluded columns when
# they are present, and replace every missing value with the sentinel -1.
excluded_columns = ['vg_inkasso', 'tarif_bez']
features_only = df[feature_columns]
keep_mask = ~features_only.columns.isin(excluded_columns)
df_sampled = features_only.loc[:, keep_mask].fillna(-1)



In [None]:
##################################################################################################

# Variant without the tariff column: feature list v7 on data set v4.
target = "veranst_segment"
feature_columns = column.features_v7  # 77 features per the original note, without tarif

data_path = column.data_path_2016_2020_v4
df = pd.read_parquet(data_path)

# Earlier experiments drew a class-frequency-weighted sample instead of using every row:
# class_frequency = df.groupby('veranst_segment')['veranst_segment'].transform('count')
# df_sampled = df.sample(n=70000, weights=class_frequency, random_state=2)
# df_sampled = df.sample(n=300000, weights=class_frequency, random_state=2)
# Work on a copy so that the in-place encoding below does not modify `df`.
df_sampled = df.copy()

In [None]:
# Variant WITH the tariff column: feature list v8 on data set v5.
target = "veranst_segment"

feature_columns = column.features_v8 # 78 features per the original note, including tarif_bez
data_path = column.data_path_2016_2020_v5 # data set in which tarif_bez is not yet encoded
df = pd.read_parquet(data_path)

# Earlier alternative (feature list v2 on data set v3, without tarif_bez):
# feature_columns = column.features_v2 #140 features # doesn't include tarif_bez
# data_path = column.data_path_2016_2020_v3
# df = pd.read_parquet(data_path)
# all_columns = column.features_v2 + [target]
# df = df[all_columns]

# Per-row sampling weight: the number of rows that share the row's 'veranst_segment' value.
# NOTE(review): weighting by class size makes rows of frequent classes MORE likely
# to be drawn — confirm this is intended rather than inverse-frequency balancing.
class_frequency = df.groupby('veranst_segment')['veranst_segment'].transform('count')
# df_sampled = df.sample(n=70000, weights=class_frequency, random_state=2)
# Draw 300,000 rows using those weights; random_state fixes the draw.
df_sampled = df.sample(n=300000, weights=class_frequency, random_state=2)

# Other variants that were tried:
# df_sampled = df_sampled[feature_columns]
# df_sampled = df[feature_columns]
# df_sampled = df.copy()

### Define Categorical features for categorical embeddings

In [None]:
# Snapshot the per-column cardinality and dtype before any column is re-encoded.
nunique = df_sampled.nunique()
types = df_sampled.dtypes

categorical_columns = []
categorical_dims = {}
for col in df_sampled.columns:
    # A column is treated as categorical when it has object dtype or fewer
    # than 200 distinct values; everything else is left untouched.
    treat_as_categorical = types[col] == 'object' or nunique[col] < 200
    if not treat_as_categorical:
        continue

    print(col, df_sampled[col].nunique())
    # Replace the column by integer codes and remember how many classes it has.
    l_enc = LabelEncoder()
    df_sampled[col] = l_enc.fit_transform(df_sampled[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

In [None]:
# Walk the feature list once and collect, for every categorical feature, its
# position within feature_columns together with its number of classes.
categorical_features = [
    (position, categorical_dims[name])
    for position, name in enumerate(feature_columns)
    if name in categorical_columns
]
cat_idxs = [position for position, _ in categorical_features]
cat_dims = [n_classes_of_feature for _, n_classes_of_feature in categorical_features]
print(cat_idxs)
print(cat_dims)

In [None]:
# Split the row POSITIONS 0..n_total-1 of df_sampled into train / validation / test index lists.
n_total = len(df_sampled)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

# Earlier split (test 20%, then one third of the remainder for validation):
# #Train set = 53%, test set = 20%, valid set = 26%
# train_val_indices, test_indices = train_test_split(
#     range(n_total), test_size=0.2, random_state=0)
# train_indices, valid_indices = train_test_split(
#     train_val_indices, test_size=0.2 / 0.6, random_state=0) #valid split = 33%(0.2/0.6)
    
# Active split: 10% test first, then 0.1 / 0.8 = 12.5% of the remaining 90% for validation,
# i.e. roughly 78.75% train / 11.25% validation / 10% test. random_state=0 fixes both draws.
train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.1, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.1 / 0.8, random_state=0) # validation share of the train+val part = 0.1 / 0.8

## Network parameters

In [None]:
# clf = TabNetClassifier(
#     n_d=64, n_a=64, n_steps=5,
#     gamma=1.5, n_independent=2, n_shared=2,
#     cat_emb_dim=1,
#     lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2),
#     scheduler_params = {"gamma": 0.95,
#                      "step_size": 20},
#     scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
# )

# TabNet classifier that is told which feature positions are categorical
# (cat_idxs) and how many classes each has (cat_dims), as computed above.
# Optimiser: Adam with lr 2e-2; StepLR multiplies the lr by 0.95 every 20 steps.
clf = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=2,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95,
                     "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
    mask_type='entmax' # alternative: 'sparsemax'
)


## Training

In [None]:
# Build the feature matrix and the label vector once, then slice them per split.
features_matrix = df_sampled[feature_columns].values
labels_vector = df_sampled[target].values

X_train = features_matrix[train_indices]
y_train = labels_vector[train_indices]
if os.getenv("CI", False):
    # Take only a subsample to run CI
    X_train = X_train[:1000, :]
    y_train = y_train[:1000]

X_valid = features_matrix[valid_indices]
y_valid = labels_vector[valid_indices]

X_test = features_matrix[test_indices]
y_test = labels_vector[test_indices]

## ======================================

## 2. Load Train & Test data frame
## Use Features obtained from Leave-one-out-target encoding

In [None]:
# Section 2 works on train/test frames that were already split and stored as parquet files.
target = "veranst_segment"
# feature_columns = column.features_v6  #80 features
feature_columns = column.features_v7  # 77 features per the original note

df_train = pd.read_parquet(column.train_data_path)
df_test = pd.read_parquet(column.test_data_path)

In [None]:
# Encode 'tarif_bez' with ONE mapping shared by both frames. Fitting a fresh
# encoder on the test frame (as was done before) assigns codes independently of
# the training frame, so the same tariff can end up with different integers in
# train and test. The vocabulary is fitted on the union of both frames so that
# tariffs occurring only in the test frame do not raise during transform.
le = LabelEncoder()
le.fit(pd.concat([df_train['tarif_bez'], df_test['tarif_bez']], ignore_index=True))

df_train['tarif_bez'] = le.transform(df_train['tarif_bez'])
df_test['tarif_bez'] = le.transform(df_test['tarif_bez'])

In [None]:
# Hold out test_size = 0.2 / 0.6 (one third) of the loaded training frame as the validation frame.
# The original note quoted "Train set = 53%, test set = 20%, valid set = 26%".
# NOTE(review): those shares presumably refer to the full data set before the
# train/test files were written — confirm.
# `df_train` is rebound to the smaller frame, so re-running this cell shrinks it again.
df_train, df_valid = train_test_split(
    df_train, test_size=0.2 / 0.6, random_state=0) # validation share of df_train = 0.2 / 0.6

### Input features - 78;
### Target feature - 1

In [None]:
# not_input_features = ['veranst_segment','vg_inkasso']

# X_train = df_train.loc[:,~df_train.columns.isin(not_input_features)].values
# y_train = df_train[target].values

# X_valid = df_valid.loc[:,~df_valid.columns.isin(not_input_features)].values
# y_valid = df_valid[target].values

# X_test = df_test.loc[:,~df_test.columns.isin(not_input_features)].values
# y_test = df_test[target].values

# Feature matrices (columns in feature_columns order) and target vectors for the three frames.
# Original note: 77 input features, without tarif.
X_train = df_train[feature_columns].values
y_train = df_train[target].values

X_valid = df_valid[feature_columns].values
y_valid = df_valid[target].values

X_test = df_test[feature_columns].values
y_test = df_test[target].values

## Network parameters

In [None]:
# TabNet classifier for the pre-split frames of this section. No cat_idxs / cat_dims
# are passed here, unlike the classifier configured in section 1.
# Optimiser: Adam with lr 2e-2; StepLR multiplies the lr by 0.95 every 20 steps.
clf = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_emb_dim=1,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95,
                     "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
)

## ======================================

## ===============================================

## 3. Load Train & Test data frame
## Use Features obtained from Leave-one-out-target encoding
## Perform 2 binary classifications

We keep only features + 'veranst_segment' columns.

Encode segments for Classifier 1:
* Variable y_clf1 
* Positive class (seg 4+) y_clf1=1
* Negative class (seg 2 or 3) y_clf1=0

Encode segments for Classifier 2: 
* Variable y_clf2
* Positive class (seg 3) y_clf2=1
* Negative class (seg 2) y_clf2=0
* Segments >3 are "encoded" as np.NaN. These NaN values will be dropped before training

In [None]:
# target = "veranst_segment"
# feature_columns = column.features_v7  #77 features

# df_train = pd.read_parquet(column.train_data_path)
# df_test = pd.read_parquet(column.test_data_path)

In [None]:
# #Train set = 53%, test set = 20%, valid set = 26%
# df_train, df_valid = train_test_split(
#     df_train, test_size=0.2 / 0.6, random_state=0) #valid split = 33%(0.2/0.6)

In [None]:
# Section 3 input: feature list v7 on data set v4, down-sampled to 300,000 rows.
target = "veranst_segment"
feature_columns = column.features_v7  # 77 features per the original note, without tarif

data_path = column.data_path_2016_2020_v4
df = pd.read_parquet(data_path)

# Per-row sampling weight: the number of rows that share the row's 'veranst_segment' value.
# NOTE(review): weighting by class size favours frequent classes — confirm this is intended.
class_frequency = df.groupby('veranst_segment')['veranst_segment'].transform('count')
# df_sampled = df.sample(n=70000, weights=class_frequency, random_state=2)
df_sampled = df.sample(n=300000, weights=class_frequency, random_state=2)

# Other variants that were tried:
# df_sampled = df_sampled[feature_columns]
# df_sampled = df[feature_columns]
# df_sampled = df.copy()

### Define Categorical features for categorical embeddings

In [None]:
# Snapshot the per-column cardinality and dtype before any column is re-encoded.
nunique = df_sampled.nunique()
types = df_sampled.dtypes

categorical_columns = []
categorical_dims = {}
for col in df_sampled.columns:
    # A column is treated as categorical when it has object dtype or fewer
    # than 200 distinct values; everything else is left untouched.
    treat_as_categorical = types[col] == 'object' or nunique[col] < 200
    if not treat_as_categorical:
        continue

    print(col, df_sampled[col].nunique())
    # Replace the column by integer codes and remember how many classes it has.
    l_enc = LabelEncoder()
    df_sampled[col] = l_enc.fit_transform(df_sampled[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

In [None]:
# Walk the feature list once and collect, for every categorical feature, its
# position within feature_columns together with its number of classes.
categorical_features = [
    (position, categorical_dims[name])
    for position, name in enumerate(feature_columns)
    if name in categorical_columns
]
cat_idxs = [position for position, _ in categorical_features]
cat_dims = [n_classes_of_feature for _, n_classes_of_feature in categorical_features]
print(cat_idxs)
print(cat_dims)

In [None]:
# Binary target for classifier 1: 1 where the integer value of 'veranst_segment' is greater than 1, else 0.
df_sampled.loc[:,'y_clf1']=(df_sampled.veranst_segment.astype(int) > 1).values.astype(int)
# Binary target for classifier 2: 1 for segment value 1, 0 for segment value 0, NaN for any other value.
# NOTE(review): presumably 'veranst_segment' already holds the label-encoded codes at this point — confirm.
df_sampled.loc[:,'y_clf2']=df_sampled.veranst_segment.apply(lambda x: 1 if x==1 else (0 if x==0 else np.nan))

In [None]:
# Split the row POSITIONS 0..n_total-1 of df_sampled into train / validation / test index lists.
n_total = len(df_sampled)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

# Earlier split (test 20%, then one third of the remainder for validation):
# #Train set = 53%, test set = 20%, valid set = 26%
# train_val_indices, test_indices = train_test_split(
#     range(n_total), test_size=0.2, random_state=0)
# train_indices, valid_indices = train_test_split(
#     train_val_indices, test_size=0.2 / 0.6, random_state=0) #valid split = 33%(0.2/0.6)

# Active split: 10% test first, then 0.1 / 0.8 = 12.5% of the remaining 90% for validation,
# i.e. roughly 78.75% train / 11.25% validation / 10% test. random_state=0 fixes both draws.
train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.1, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.1 / 0.8, random_state=0) # validation share of the train+val part = 0.1 / 0.8

## Plot Losses and Accuracy

In [None]:
def plot_loss_accuracy(model):
    """Plot the training loss and the train/validation AUC of a fitted model.

    The loss and the AUC live on different scales, so each gets its own panel
    with axis labels and a legend instead of three unlabeled lines on one axis.

    Args:
        model: fitted estimator whose ``history`` provides the per-epoch
            series ``'loss'``, ``'train_auc'`` and ``'valid_auc'``.
    """
    fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: training loss per epoch.
    loss_ax.plot(model.history['loss'], label='train loss')
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend(loc="best")

    # Right panel: AUC on the evaluation sets named 'train' and 'valid'.
    auc_ax.plot(model.history['train_auc'], label='train AUC')
    auc_ax.plot(model.history['valid_auc'], label='valid AUC')
    auc_ax.set_xlabel("Epoch")
    auc_ax.set_ylabel("AUC")
    auc_ax.legend(loc="best")

    fig.tight_layout()

## Test Prediction 

In [None]:
def test_prediction(model, input, target):
    """Score a fitted model on a held-out set and print the accuracy twice.

    The accuracy is first computed from the arg-max of the predicted class
    probabilities and then again from ``model.predict``; both are printed.

    Args:
        model: fitted classifier offering ``predict_proba`` and ``predict``.
        input: feature matrix to predict on.
        target: true labels belonging to ``input``.

    Returns:
        Tuple ``(preds, y_pred, test_acc)``: the class probabilities, the
        labels returned by ``model.predict`` and the accuracy of those labels.
    """
    dataset_name = 'Intellizenz'

    def _score_and_report(labels):
        # Accuracy of `labels` against the true target, echoed to stdout.
        score = accuracy_score(y_pred=labels, y_true=target)
        print(f"FINAL TEST SCORE FOR {dataset_name} : {score}")
        return score

    # Variant 1: column index of the highest class probability per row.
    preds = model.predict_proba(input)
    _score_and_report(np.argmax(preds, axis=1))

    # Variant 2: the model's own predict method; this is what gets returned.
    y_pred = model.predict(input)
    test_acc = _score_and_report(y_pred)

    return preds, y_pred, test_acc

## Save Model

In [None]:
def save_model(model, path):
    """Persist a fitted model and report where it was written.

    Args:
        model: object exposing ``save_model(path)``.
        path: destination path handed to ``model.save_model``.

    Returns:
        Whatever ``model.save_model`` returns (the saved file name), so the
        caller can pass it straight to ``load_model``. Previously this value
        was computed and then discarded.
    """
    saved_filename = model.save_model(path)
    return saved_filename

def load_model(path):
    """Create a fresh TabNetClassifier and restore its saved state from ``path``.

    Args:
        path: location of the saved model, handed to ``load_model`` of the
            new classifier.

    Returns:
        The classifier after its state has been loaded.
    """
    restored = TabNetClassifier()
    restored.load_model(path)
    return restored

## Wandb Logging

In [None]:
def wand_log(model, pred_probas, y_test, y_pred, test_accuracy,
             config=None, class_labels=None):
    """Log the evaluation of a binary classifier to a new Weights & Biases run.

    Logged items: confusion matrix, PR curve, ROC curve, the per-epoch
    training loss / train AUC / validation AUC, and the test accuracy.

    Args:
        model: fitted model whose ``history`` provides ``'loss'``,
            ``'train_auc'`` and ``'valid_auc'``.
        pred_probas: predicted class probabilities for the test set.
        y_test: true test labels.
        y_pred: predicted test labels.
        test_accuracy: accuracy on the test set.
        config: optional run configuration; defaults to the hyper-parameters
            that were hard-coded before (lr 0.02, 200 epochs, batch 16384).
        class_labels: optional display names of class 0 and class 1 used in
            the PR / ROC plots; defaults to the previously hard-coded names.
    """
    if config is None:
        config = {
            "learning_rate": 0.02,
            "epochs": 200,
            "batch_size": 16384
        }
    if class_labels is None:
        class_labels = ['Segment 0-50€ or 50-100€', 'Segment >100€']

    # Hand the configuration to init(): assigning a dict to `wandb.config`
    # only rebinds the module attribute and does not configure the run.
    wandb.init(project="Intellizenz", entity="elsaravana", config=config)

    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                        preds=y_pred, y_true=y_test,
                        class_names=[0, 1])})
    wandb.log({"pr" : wandb.plot.pr_curve(y_true=y_test, y_probas=pred_probas,
                labels=class_labels, classes_to_plot=[0, 1])})
    wandb.log({"roc" : wandb.plot.roc_curve(y_true=y_test, y_probas=pred_probas,
                    labels=class_labels, classes_to_plot=[0, 1])})

    train_loss = model.history['loss']
    train_auc = model.history['train_auc']
    validation_auc = model.history['valid_auc']

    # One log entry per epoch so the three curves share the same step axis.
    for i, loss in enumerate(train_loss):
        wandb.log({"train_loss": loss,
            "train_auc": train_auc[i],
            "validation_auc": validation_auc[i]})

    wandb.log({"test_accuracy": test_accuracy})
    # exit_code 0, to finish a successful run
    wandb.finish(0)

### Classification 1

In [None]:
# ## 77 input features - without tarif
# X_train_clf1 = df_train[feature_columns].values
# y_train_clf1 = df_train.y_clf1

# X_valid_clf1 = df_valid[feature_columns].values
# y_valid_clf1 = df_valid.y_clf1

# X_test_clf1 = df_test[feature_columns].values
# y_test_clf1 = df_test.y_clf1

In [None]:
# Classifier 1 data: the feature matrix (original note: 77 input features,
# without tarif) and the y_clf1 labels are built once and sliced per split.
features_matrix_clf1 = df_sampled[feature_columns].values
labels_vector_clf1 = df_sampled['y_clf1'].values

X_train_clf1, y_train_clf1 = features_matrix_clf1[train_indices], labels_vector_clf1[train_indices]
X_valid_clf1, y_valid_clf1 = features_matrix_clf1[valid_indices], labels_vector_clf1[valid_indices]
X_test_clf1, y_test_clf1 = features_matrix_clf1[test_indices], labels_vector_clf1[test_indices]

In [None]:
y_train_clf1

In [None]:
## Network parameters
# Classifier 1 is configured WITHOUT cat_idxs / cat_dims; the variant with
# categorical embeddings is kept commented out below this call.
# Optimiser: Adam with lr 2e-2; StepLR multiplies the lr by 0.95 every 20 steps.
clf1 = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_emb_dim=1,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95,
                     "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
)

# clf1 = TabNetClassifier(
#     n_d=64, n_a=64, n_steps=5,
#     gamma=1.5, n_independent=2, n_shared=2,
#     cat_idxs=cat_idxs,
#     cat_dims=cat_dims,
#     cat_emb_dim=2,
#     lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2),
#     scheduler_params = {"gamma": 0.95,
#                      "step_size": 20},
#     scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
#     mask_type='entmax' # sparsemax
# )

# Two epochs are enough when the CI environment variable is set.
max_epochs = 200 if not os.getenv("CI", False) else 2

aug = ClassificationSMOTE(p=0.2)
#SMOTE - Synthetic Minority Oversampling Technique
clf1.fit(
    X_train=X_train_clf1, y_train=y_train_clf1,
    # The set named 'train' must be the training data; it used to be a second
    # copy of the validation data, which made 'train_auc' equal to 'valid_auc'.
    eval_set=[(X_train_clf1, y_train_clf1), (X_valid_clf1, y_valid_clf1)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs, patience=100,
    batch_size=16384, virtual_batch_size=256,
    #batch_size can be 1-10% of whole training dataset size
    #Training - 1.3M, 1% - 13,000, 10% - 130,000
    augmentations=aug
)

### Classification 1 save model, plot, visualization

In [None]:
# Training loss and train/valid AUC curves of classifier 1.
plot_loss_accuracy(clf1)

In [None]:
# Evaluate classifier 1 on its test split; keeps probabilities, predicted labels and accuracy.
pred_probas, y_pred, test_acc = test_prediction(clf1, X_test_clf1, y_test_clf1)

In [None]:
y_pred

In [None]:
# Send the classifier 1 evaluation to Weights & Biases (starts and finishes one run).
wand_log(model=clf1,pred_probas=pred_probas, y_test= y_test_clf1, y_pred=y_pred, test_accuracy=test_acc)

In [None]:
# Persist classifier 1.
# NOTE(review): classifier 2 is later saved to this same path, which presumably overwrites this file — confirm.
save_model(clf1,path="./baseline_tabnet_model_test_1")

In [None]:
# clf_1 = load_model(path="./baseline_tabnet_model_test_1")

In [None]:
# Repeats the plot / evaluate / log steps of the three cells above in one go.
# Running both this cell and those cells creates two W&B runs for classifier 1.
plot_loss_accuracy(clf1)
pred_probas, y_pred, test_acc = test_prediction(clf1, X_test_clf1, y_test_clf1)

wand_log(model=clf1,pred_probas=pred_probas, y_test= y_test_clf1, y_pred=y_pred, test_accuracy=test_acc)

### Classification 2

In [None]:
# # For Classifier2 (clf2) we use data from segments 2 and 3 (so we drop records where y_clf2 is NaN)
# df_train_clf2 = df_train.dropna(subset=['y_clf2'])
# df_valid_clf2 = df_valid.dropna(subset=['y_clf2'])
# df_test_clf2 = df_test.dropna(subset=['y_clf2'])


# ## 77 input features - without tarif
# X_train_clf2 = df_train_clf2[feature_columns].values
# y_train_clf2 = df_train_clf2.y_clf2

# X_valid_clf2 = df_valid_clf2[feature_columns].values
# y_valid_clf2 = df_valid_clf2.y_clf2

# X_test_clf2 = df_test_clf2[feature_columns].values
# y_test_clf2 = df_test_clf2.y_clf2

In [None]:
# Classifier 2 (clf2) only uses the rows whose y_clf2 label is defined, so rows
# with a NaN label are dropped. The explicit copy makes the subset an
# independent frame: its columns are overwritten in place by the encoding cell
# that follows, which would otherwise trigger pandas' SettingWithCopyWarning.
df_sampled_clf2 = df_sampled.dropna(subset=['y_clf2']).copy()

In [None]:
# Snapshot the per-column cardinality and dtype of the classifier 2 subset
# before any column is re-encoded.
nunique_clf2 = df_sampled_clf2.nunique()
types_clf2 = df_sampled_clf2.dtypes

categorical_columns_clf2 = []
categorical_dims_clf2 = {}
for col in df_sampled_clf2.columns:
    # A column is treated as categorical when it has object dtype or fewer
    # than 200 distinct values; everything else is left untouched.
    treat_as_categorical = types_clf2[col] == 'object' or nunique_clf2[col] < 200
    if not treat_as_categorical:
        continue

    print(col, df_sampled_clf2[col].nunique())
    # Replace the column by integer codes and remember how many classes it has.
    l_enc = LabelEncoder()
    df_sampled_clf2[col] = l_enc.fit_transform(df_sampled_clf2[col].values)
    categorical_columns_clf2.append(col)
    categorical_dims_clf2[col] = len(l_enc.classes_)

In [None]:
# Walk the feature list once and collect, for every categorical feature of the
# classifier 2 subset, its position within feature_columns and its class count.
categorical_features_clf2 = [
    (position, categorical_dims_clf2[name])
    for position, name in enumerate(feature_columns)
    if name in categorical_columns_clf2
]
cat_idxs_clf2 = [position for position, _ in categorical_features_clf2]
cat_dims_clf2 = [n_classes_of_feature for _, n_classes_of_feature in categorical_features_clf2]
print(cat_idxs_clf2)
print(cat_dims_clf2)

In [None]:
# Split the row POSITIONS 0..n_total_clf2-1 of df_sampled_clf2 into train / validation / test index lists.
n_total_clf2 = len(df_sampled_clf2)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

# Earlier split (test 20%, then one third of the remainder for validation):
# #Train set = 53%, test set = 20%, valid set = 26%
# train_val_indices_clf2, test_indices_clf2 = train_test_split(
#     range(n_total_clf2), test_size=0.2, random_state=0)
# train_indices_clf2, valid_indices_clf2 = train_test_split(
#     train_val_indices_clf2, test_size=0.2 / 0.6, random_state=0) #valid split = 33%(0.2/0.6)

# Active split: 10% test first, then 0.1 / 0.8 = 12.5% of the remaining 90% for validation,
# i.e. roughly 78.75% train / 11.25% validation / 10% test. random_state=0 fixes both draws.
train_val_indices_clf2, test_indices_clf2 = train_test_split(
    range(n_total_clf2), test_size=0.1, random_state=0)
train_indices_clf2, valid_indices_clf2 = train_test_split(
    train_val_indices_clf2, test_size=0.1 / 0.8, random_state=0) # validation share of the train+val part = 0.1 / 0.8

In [None]:
# Classifier 2 data: the feature matrix and the y_clf2 labels of the subset
# are built once and sliced per split.
features_matrix_clf2 = df_sampled_clf2[feature_columns].values
labels_vector_clf2 = df_sampled_clf2['y_clf2'].values

X_train_clf2, y_train_clf2 = features_matrix_clf2[train_indices_clf2], labels_vector_clf2[train_indices_clf2]
X_valid_clf2, y_valid_clf2 = features_matrix_clf2[valid_indices_clf2], labels_vector_clf2[valid_indices_clf2]
X_test_clf2, y_test_clf2 = features_matrix_clf2[test_indices_clf2], labels_vector_clf2[test_indices_clf2]

In [None]:
## Network parameters
# clf2 = TabNetClassifier(
#     n_d=64, n_a=64, n_steps=5,
#     gamma=1.5, n_independent=2, n_shared=2,
#     cat_emb_dim=1,
#     lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2),
#     scheduler_params = {"gamma": 0.95,
#                      "step_size": 20},
#     scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
# )

# Classifier 2 uses the categorical positions / class counts computed on the
# classifier 2 subset (cat_idxs_clf2, cat_dims_clf2).
# Optimiser: Adam with lr 2e-2; StepLR multiplies the lr by 0.95 every 20 steps.
clf2 = TabNetClassifier(
     n_d=64, n_a=64, n_steps=5,
     gamma=1.5, n_independent=2, n_shared=2,
     cat_idxs=cat_idxs_clf2,
     cat_dims=cat_dims_clf2,
     cat_emb_dim=2,
     lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
     optimizer_fn=torch.optim.Adam,
     optimizer_params=dict(lr=2e-2),
     scheduler_params = {"gamma": 0.95,
                      "step_size": 20},
     scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
     mask_type='entmax' # alternative: 'sparsemax'
)

# Two epochs are enough when the CI environment variable is set.
max_epochs = 200 if not os.getenv("CI", False) else 2

aug = ClassificationSMOTE(p=0.2)
#SMOTE - Synthetic Minority Oversampling Technique
clf2.fit(
    X_train=X_train_clf2, y_train=y_train_clf2,
    # The set named 'train' must be the training data; it used to be a second
    # copy of the validation data, which made 'train_auc' equal to 'valid_auc'.
    eval_set=[(X_train_clf2, y_train_clf2), (X_valid_clf2, y_valid_clf2)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs, patience=100,
    batch_size=16384, virtual_batch_size=256,
    augmentations=aug
)

In [None]:
# Evaluate classifier 2 on its test split; keeps probabilities, predicted labels and accuracy.
pred_probas_clf2, y_pred_clf2, test_acc_clf2 = test_prediction(clf2, X_test_clf2, y_test_clf2)

In [None]:
# Send the classifier 2 evaluation to Weights & Biases.
# NOTE(review): wand_log labels the PR/ROC curves 'Segment 0-50€ or 50-100€' vs 'Segment >100€',
# which looks like the classifier 1 task rather than this one — confirm the labels.
wand_log(model=clf2,pred_probas=pred_probas_clf2, y_test= y_test_clf2, y_pred=y_pred_clf2, test_accuracy=test_acc_clf2)

### Classification 2 save model, plot, visualization

In [None]:
# Persist classifier 2.
# NOTE(review): this is the same path classifier 1 was saved to, so that file is presumably overwritten — confirm.
save_model(clf2,path="./baseline_tabnet_model_test_1")

In [None]:
# Replace the in-memory classifier 2 by the model restored from the saved archive.
clf2 = load_model(path="./baseline_tabnet_model_test_1.zip")

# Absolute location of that archive on the original author's machine:
# C:/Users/sgopalakrish/Downloads/intellizenz-model-training/Neuro-symbolic-AI/SLASH/TabNet/baseline_tabnet_model_test_1.zip

In [None]:
clf2

In [None]:
# Re-evaluate the restored classifier 2 on its test split.
# This rebinds pred_probas / y_pred / test_acc, which held the classifier 1 results until now.
# plot_loss_accuracy(clf2)
pred_probas, y_pred, test_acc = test_prediction(clf2, X_test_clf2, y_test_clf2)

# wand_log(model=clf2,pred_probas=pred_probas, y_test= y_test_clf2, y_pred=y_pred, test_accuracy=test_acc)

## ===============================================

## ============================================================================================================

In [None]:
# Epoch budget for the three-class run below: 200, or 2 when the CI environment variable is set.
max_epochs = 200 if not os.getenv("CI", False) else 2

In [None]:
# Sanity check: the feature matrix and the label vector should report the same number of rows.
print(len(X_train))
print(len(y_train))

In [None]:
# Scratch cell unrelated to the model: shows operator precedence of unary minus vs .pow().
x1 = torch.Tensor([-5])
print((-x1).pow(2))  # negate first, then square
print(-(x1).pow(2))  # square first, then negate

In [None]:
# Train the classifier `clf` on X_train / y_train (whichever section defined them last),
# evaluating on the training and the validation set after each epoch.
aug = ClassificationSMOTE(p=0.2)
#SMOTE - Synthetic Minority Oversampling Technique
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs, patience=100,
    batch_size=16384, virtual_batch_size=256,
    augmentations=aug
)

In [None]:
# Training loss of `clf` per epoch.
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.plot(clf.history['loss'])

In [None]:
# Accuracy of `clf` per epoch on the evaluation sets named 'train' and 'valid'.
plt.plot(clf.history['train_accuracy'])
plt.plot(clf.history['valid_accuracy'])

### Predictions

In [None]:
# Name used in the score messages printed below.
dataset_name = 'Intellizenz'

In [None]:
# To get final results you may need to use a mapping for classes 
# as you are allowed to use targets like ["yes", "no", "maybe", "I don't know"]

# Maps a probability-column index to the class label stored in clf.classes_.
preds_mapper = { idx : class_name for idx, class_name in enumerate(clf.classes_)}

preds = clf.predict_proba(X_test)

# Most probable column per row, translated back to its class label.
y_pred = np.vectorize(preds_mapper.get)(np.argmax(preds, axis=1))

test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)

print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")

In [None]:
# or you can simply use the predict method
# (this rebinds y_pred and test_acc from the cell above)

y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")

## Classification report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 of the current y_pred, restricted to the labels 0, 1 and 2.
print(classification_report(y_test, y_pred, labels=[0, 1, 2]))

In [None]:
# print(classification_report(d_true, d_pred, labels=[0, 1, 2]))
## Classification report of tabnet model training on 300k data with 78 features.

## Save and load Model

In [None]:
# Save the fitted `clf`; `saved_filename` receives the value returned by clf.save_model
# and is what the load cell further down reads.
# saving_path_name = "./baseline_tabnet_model_test_1"

saving_path_name = "./baseline_tabnet_model_clf_d300k_140feat_lr_0.02_ep200"
saved_filename = clf.save_model(saving_path_name)

In [None]:
# Rebinds saving_path_name to the 78-feature model name.
# NOTE(review): the load cell below reads `saved_filename`, not this variable, so this
# assignment has no effect on which model is loaded — confirm which file is intended.
saving_path_name = 'baseline_tabnet_model_clf_d300k_78feat_lr_0.02_ep200'

In [None]:
# Create a new classifier and restore its state from the file written by the save cell above.
loaded_clf = TabNetClassifier()
loaded_clf.load_model(saved_filename)

In [None]:
# Predict with the restored model; this rebinds y_pred, which later cells
# (binarisation, confusion matrix) read.
y_pred = loaded_clf.predict(X_test)

In [None]:
# Probabilities from the restored model, mapped back to class labels with the
# preds_mapper built from the original `clf`.
loaded_preds = loaded_clf.predict_proba(X_test)
loaded_y_pred = np.vectorize(preds_mapper.get)(np.argmax(loaded_preds, axis=1))

loaded_test_acc = accuracy_score(y_pred=loaded_y_pred, y_true=y_test)

print(f"FINAL TEST SCORE FOR {dataset_name} : {loaded_test_acc}")

In [None]:
import wandb

# Start the run and hand it the hyper-parameters through `config=`: assigning
# a dict to `wandb.config` afterwards only rebinds the module attribute and
# does not configure the run.
wandb.init(
    project="Intellizenz", entity="elsaravana",
    config={
        "learning_rate": 0.02,
        "epochs": 200,
        "batch_size": 16384
    },
)

# Three-class evaluation plots, computed from the restored model's output.
wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                    preds=loaded_y_pred, y_true=y_test,
                    class_names=[0, 1, 2])})
wandb.log({"pr" : wandb.plot.pr_curve(y_true=y_test, y_probas=loaded_preds,
             labels=['Segment 0-50€', 'Segment 50-100€', 'Segment >100€'], classes_to_plot=[0, 1, 2])})
wandb.log({"roc" : wandb.plot.roc_curve(y_true=y_test, y_probas=loaded_preds,
                labels=['Segment 0-50€', 'Segment 50-100€', 'Segment >100€'], classes_to_plot=[0, 1, 2])})

# Training curves come from the history of the originally fitted `clf`;
# one log entry per epoch keeps the three series on the same step axis.
train_loss = clf.history['loss']
train_accuracy = clf.history['train_accuracy']
validation_accuracy = clf.history['valid_accuracy']
for i,loss in enumerate(train_loss):
    wandb.log({"train_loss": loss, 
        "train_accuracy": train_accuracy[i],
        "validation_accuracy": validation_accuracy[i]})

In [None]:
# Log the restored model's test accuracy to the open run (alternative: write it to the run summary).
# wandb.summary["test_accuracy"] = loaded_test_acc
wandb.log({"test_accuracy": loaded_test_acc})

In [None]:
# The restored model must reproduce the accuracy stored in test_acc exactly.
assert(test_acc == loaded_test_acc)

In [None]:
# Close the W&B run; exit code 0 marks it as successful.
wandb.finish(0)

## Global explainability: feature importance summing to 1

In [None]:
clf.feature_importances_

## Local explainability and masks

In [None]:
# Per-sample explanation of the test predictions; `masks` is indexed by an integer in the plotting cell below.
explain_matrix, masks = clf.explain(X_test)

In [None]:
# One panel per mask (five in total), each showing the first 50 rows of that mask.
fig, axs = plt.subplots(1, 5, figsize=(20,20))

for step, panel in enumerate(axs):
    panel.imshow(masks[step][:50])
    panel.set_title(f"mask {step}")

### Plot PR curve

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve,roc_curve

# Number of target classes used by the binarisation and the curve plots below.
n_classes = 3

In [None]:
# Convert the true labels and the predicted labels to a one-hot representation,
# e.g. label 0 -> [1 0 0] and label 1 -> [0 1 0]; column i belongs to class i.
y_true_binarize = label_binarize(y_test, classes=[*range(n_classes)])
y_pred_binarize = label_binarize(y_pred, classes=[*range(n_classes)])

In [None]:
# Precision-recall curve, one-vs-rest per class.
# The curve needs a continuous score per sample: the predicted class
# probabilities in `preds` are used. Feeding the binarized hard predictions
# (as before) leaves a single threshold and therefore a degenerate curve.
class_labels = ['Class 0(0-50€)', 'Class 1(50-100€)', 'Class 2(>100€)']
precision = dict()
recall = dict()
for i in range(n_classes):
    # Any class index beyond the named ones falls back to the last label.
    label = class_labels[min(i, len(class_labels) - 1)]

    precision[i], recall[i], _ = precision_recall_curve(y_true_binarize[:, i],
                                                        preds[:, i])
    plt.plot(recall[i], precision[i], lw=2, label=label)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="best")
# plt.title("Precision vs recall curve")
plt.show()


## ROC Curve

In [None]:
# ROC curve, one-vs-rest per class.
# The curve needs a continuous score per sample: the predicted class
# probabilities in `preds` are used. Feeding the binarized hard predictions
# (as before) leaves a single threshold and therefore a degenerate curve.
class_labels = ['Class 0(0-50€)', 'Class 1(50-100€)', 'Class 2(>100€)']
fpr = dict()
tpr = dict()
for i in range(n_classes):
    # Any class index beyond the named ones falls back to the last label.
    label = class_labels[min(i, len(class_labels) - 1)]

    fpr[i], tpr[i], _ = roc_curve(y_true_binarize[:, i],
                                  preds[:, i])
    plt.plot(fpr[i], tpr[i], lw=2, label=label)

plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="best")
# plt.title("ROC curve")
plt.show()

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix

# mul_c = multilabel_confusion_matrix(y_true_binarize, y_pred_binarize, labels=['Class 0(0-50€)', 'Class 1(50-100€)', 'Class 2(>100€)'])
# mul_c


In [None]:
# Confusion matrix of the current y_pred, wrapped in a frame with readable class names.
# Per the scikit-learn convention, rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index = ['Class 0(0-50€)', 'Class 1(50-100€)', 'Class 2(>100€)'], columns= ['Class 0(0-50€)', 'Class 1(50-100€)', 'Class 2(>100€)'])


In [None]:
import seaborn as sns

In [None]:
# Heatmap of the confusion matrix. confusion_matrix(y_true, y_pred) puts the
# TRUE classes on the rows and the PREDICTED classes on the columns, so the
# y-axis shows actual values and the x-axis predicted values (the two axis
# labels were swapped before).
plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True)
plt.xlabel('Predicted values')
plt.ylabel('Actual values')
plt.show()

## SLASH Plots
### Tabnet + SLASH - 140 features

In [2]:
import pandas as pd
from sklearn.metrics import classification_report

In [None]:
# Classification report of SLASH +TabNet model trained on 300,000 data with 140 features
conf_mat_df = pd.read_csv('140_features_slash_tabnet_conf_matr.csv')

actual = conf_mat_df['Actual']
pred = conf_mat_df['Predicted']
count = conf_mat_df['nPredictions']

# Expand every aggregated (actual, predicted, n) row of the table into n
# individual label pairs so that classification_report can consume them.
true_labels = []
pred_labels = []
for actual_label, predicted_label, n_repeats in zip(actual, pred, count):
    true_labels.extend([actual_label] * n_repeats)
    pred_labels.extend([predicted_label] * n_repeats)

print(classification_report(true_labels, pred_labels, labels=[0, 1, 2]))

In [6]:
# Classification report of SLASH +TabNet model trained on 300,000 data with 78 features
# conf_mat_df = pd.read_csv('78_features_slash_tabnet_conf_matr.csv')

# Classification report of SLASH +TabNet model trained on normalized 78 features
# conf_mat_df = pd.read_csv('78_norm_features_slash_tabnet_d5k_ep30_conf_matr.csv')
conf_mat_df = pd.read_csv('78_norm_features_slash_tabnet_d1_5k_ep30_conf_matr.csv')

actual = conf_mat_df['Actual']
pred = conf_mat_df['Predicted']
count = conf_mat_df['nPredictions']

# Expand every aggregated (actual, predicted, n) row of the table into n
# individual label pairs so that classification_report can consume them.
true_labels = []
pred_labels = []
for actual_label, predicted_label, n_repeats in zip(actual, pred, count):
    true_labels.extend([actual_label] * n_repeats)
    pred_labels.extend([predicted_label] * n_repeats)

print(classification_report(true_labels, pred_labels, labels=[0, 1, 2]))

              precision    recall  f1-score   support

           0       0.44      0.27      0.34        95
           1       0.34      0.77      0.47        92
           2       0.55      0.06      0.12        93

    accuracy                           0.37       280
   macro avg       0.44      0.37      0.31       280
weighted avg       0.44      0.37      0.31       280



## NeurASP Plots
### 2 Hidden Layer MLP + NeurASP

In [3]:
# Classification report of 2 Hidden Layer + NeurASP model trained on 300,000 data with 78 features
# conf_mat_df = pd.read_csv('78_features_neurasp_2hl_mlp_conf_matr.csv')
conf_mat_df = pd.read_csv('78_norm_features_neurasp_2hl_mlp_event_tarif_rule_conf_matr.csv')

actual = conf_mat_df['Actual']
pred = conf_mat_df['Predicted']
count = conf_mat_df['nPredictions']

# Expand every aggregated (actual, predicted, n) row of the table into n
# individual label pairs so that classification_report can consume them.
true_labels = []
pred_labels = []
for actual_label, predicted_label, n_repeats in zip(actual, pred, count):
    true_labels.extend([actual_label] * n_repeats)
    pred_labels.extend([predicted_label] * n_repeats)

print(classification_report(true_labels, pred_labels, labels=[0, 1, 2]))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78     20258
           1       0.69      0.63      0.66     19959
           2       0.77      0.85      0.81     19783

    accuracy                           0.75     60000
   macro avg       0.75      0.75      0.75     60000
weighted avg       0.75      0.75      0.75     60000



In [None]:
# Classification report of 2 Hidden Layer + NeurASP model trained on 300,000 data with 140 features
conf_mat_df = pd.read_csv('140_features_neurasp_2hl_mlp_conf_matr.csv')

actual = conf_mat_df['Actual']
pred = conf_mat_df['Predicted']
count = conf_mat_df['nPredictions']

# Expand every aggregated (actual, predicted, n) row of the table into n
# individual label pairs so that classification_report can consume them.
true_labels = []
pred_labels = []
for actual_label, predicted_label, n_repeats in zip(actual, pred, count):
    true_labels.extend([actual_label] * n_repeats)
    pred_labels.extend([predicted_label] * n_repeats)

print(classification_report(true_labels, pred_labels, labels=[0, 1, 2]))