In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
import pandas as pd
import os.path as osp

In [2]:
data_dir = '../data/'
df = pd.read_csv('../data/processed/train.csv', index_col='PassengerId')
df

Unnamed: 0_level_0,age,fare,sibsp,parch,pclass,sex,embarked,age_group,familysz,survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,22.000000,7.2500,1.0,0.0,2,1,2,4,1,0
2,38.000000,71.2833,1.0,0.0,0,0,0,0,1,1
3,26.000000,7.9250,0.0,0.0,2,0,2,4,1,1
4,35.000000,53.1000,1.0,0.0,0,0,2,0,1,1
5,35.000000,8.0500,0.0,0.0,2,1,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...
886,39.000000,29.1250,0.0,5.0,2,0,1,0,0,0
888,19.000000,30.0000,0.0,0.0,0,0,2,3,1,1
889,19.091437,23.4500,1.0,2.0,2,0,2,3,0,0
890,26.000000,30.0000,0.0,0.0,0,1,0,4,1,1


In [3]:
cat_dims = [df[col].nunique() for col in df.columns[4:]]
print(cat_dims)
cat_idxs = list(range(4, len(df.columns)))

[3, 2, 3, 5, 2, 2]


In [4]:
df['survived'].value_counts(normalize=True)

0    0.583333
1    0.416667
Name: survived, dtype: float64

# Complejidad 3 (30%)

## 12. Desarrolle un modelo que permita obtener los valores de la variable objetivo

In [21]:
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
import os.path as osp
from argparse import ArgumentParser, Namespace

RANDOM_STATE = hash("fuck yeah!") % (2 ** 32 - 1)
print(RANDOM_STATE)


train_path = osp.join('../data/processed/', "train.csv")
train_df = pd.read_csv(train_path, index_col="PassengerId")
y = train_df["survived"].values
X = train_df.drop(["survived"], axis=1).values
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

val_mean_aucs = []
train_mean_aucs = []
best_epochs = []
best_model = None
best_val_auc = 0

for train_index, test_index in kf.split(X):
    X_train, X_val = X[train_index], X[test_index]
    y_train, y_val = y[train_index], y[test_index]
    model = TabNetClassifier(
        seed=RANDOM_STATE,
    )
    model.fit(
        X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], 
        eval_name=['train', 'val'],
    )
    valpreds = model.predict_proba(X_val)[:, 1]
    trainpreds = model.predict_proba(X_train)[:, 1]
    
    val_auc = roc_auc_score(y_val, valpreds)
    train_auc = roc_auc_score(y_train, trainpreds)
    
    if val_auc > best_val_auc:
        best_model = model
        best_val_auc = val_auc
    
    val_mean_aucs.append(val_auc)
    train_mean_aucs.append(train_auc)
    best_epochs.append(model.best_epoch)

534413108
Device used : cpu
epoch 0  | loss: 0.83448 | train_auc: 0.46237 | val_auc: 0.54022 |  0:00:00s
epoch 1  | loss: 0.7376  | train_auc: 0.47446 | val_auc: 0.55707 |  0:00:00s
epoch 2  | loss: 0.65959 | train_auc: 0.51665 | val_auc: 0.56105 |  0:00:00s
epoch 3  | loss: 0.59701 | train_auc: 0.60955 | val_auc: 0.69973 |  0:00:00s
epoch 4  | loss: 0.57807 | train_auc: 0.67045 | val_auc: 0.70217 |  0:00:00s
epoch 5  | loss: 0.5771  | train_auc: 0.66585 | val_auc: 0.7231  |  0:00:00s
epoch 6  | loss: 0.58472 | train_auc: 0.6701  | val_auc: 0.7279  |  0:00:01s
epoch 7  | loss: 0.57493 | train_auc: 0.71573 | val_auc: 0.76721 |  0:00:01s
epoch 8  | loss: 0.55263 | train_auc: 0.73805 | val_auc: 0.776   |  0:00:01s
epoch 9  | loss: 0.52712 | train_auc: 0.73134 | val_auc: 0.77609 |  0:00:01s
epoch 10 | loss: 0.51324 | train_auc: 0.74324 | val_auc: 0.77591 |  0:00:01s
epoch 11 | loss: 0.5298  | train_auc: 0.75177 | val_auc: 0.78225 |  0:00:01s
epoch 12 | loss: 0.5236  | train_auc: 0.74924 | 

In [26]:
train_mean_aucs = np.array(train_mean_aucs)
val_mean_aucs = np.array(val_mean_aucs)
best_epochs = np.array(best_epochs)

for be, t, v in zip(best_epochs, train_mean_aucs, val_mean_aucs):
    print(f"Best epoch: {be} | train AUC {t} | val AUC {v}")

Best epoch: 15 | train AUC 0.7640485420529244 | val AUC 0.7896739130434784
Best epoch: 20 | train AUC 0.7421988918051909 | val AUC 0.6693078324225865
Best epoch: 28 | train AUC 0.8093996239255014 | val AUC 0.8657885040530583
Best epoch: 22 | train AUC 0.7588784303026168 | val AUC 0.7280239099859352
Best epoch: 8 | train AUC 0.7504327982373309 | val AUC 0.8167388167388168


In [28]:
print(train_mean_aucs.mean())
print(val_mean_aucs.mean())

0.7649916572647129
0.7739065952487751


In [51]:
preds = best_model.predict_proba(X)[:, 1]
auc = roc_auc_score(y, preds)
print(f"AUC = {auc}")
fpr, tpr, thr = roc_curve(y, preds)
optimial_thr = thr[np.argmax(tpr - fpr)]
print(optimial_thr)

AUC = 0.8205845301083395
0.10058495


In [52]:
y_hat = (preds > optimial_thr).astype(int)
(y_hat == y).sum() / len(y)

0.7632275132275133

In [59]:
name = 'best_model_train_auc=82'
best_model.save_model(name)
loaded_model = TabNetClassifier()
loaded_model.load_model(f"{name}.zip")
loaded_model

Successfully saved model at best_model_train_auc=82.zip
Device used : cpu
Device used : cpu


TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=534413108, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='sparsemax', input_dim=9, output_dim=2, device_name='auto')

In [60]:
test_df = pd.read_csv(osp.join('../data/processed/', "test.csv"), index_col="PassengerId")
X_test = test_df.values
probs = loaded_model.predict_proba(X_test)[:, 1]
y_hat = (probs > optimial_thr).astype(int)

In [61]:
pd.DataFrame(
    data={
        'y_pred': y_hat,
    },
    index=test_df.index
).to_csv('submission.csv')