In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
import pandas as pd
import os.path as osp

In [2]:
data_dir = '../data/'
df = pd.read_csv('../data/processed/train.csv', index_col='PassengerId')
df

Unnamed: 0_level_0,age,fare,sibsp,parch,pclass,sex,embarked,age_group,familysz,survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,22.000000,7.2500,1.0,0.0,2,1,2,4,1,0
2,38.000000,71.2833,1.0,0.0,0,0,0,0,1,1
3,26.000000,7.9250,0.0,0.0,2,0,2,4,1,1
4,35.000000,53.1000,1.0,0.0,0,0,2,0,1,1
5,35.000000,8.0500,0.0,0.0,2,1,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...
886,39.000000,29.1250,0.0,5.0,2,0,1,0,0,0
888,19.000000,30.0000,0.0,0.0,0,0,2,3,1,1
889,19.091437,23.4500,1.0,2.0,2,0,2,3,0,0
890,26.000000,30.0000,0.0,0.0,0,1,0,4,1,1


In [3]:
cat_dims = [df[col].nunique() for col in df.columns[4:]]
print(cat_dims)
cat_idxs = list(range(4, len(df.columns)))

[3, 2, 3, 5, 2, 2]


In [4]:
df['survived'].value_counts(normalize=True)

0    0.583333
1    0.416667
Name: survived, dtype: float64

# Complejidad 3 (30%)

## 12. Desarrolle un modelo que permita obtener los valores de la variable objetivo

In [5]:
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
import os.path as osp
from argparse import ArgumentParser, Namespace

RANDOM_STATE = hash("fuck yeah!") % (2 ** 32 - 1)
print(RANDOM_STATE)


train_path = osp.join('../data/processed/', "train.csv")
train_df = pd.read_csv(train_path, index_col="PassengerId")
y = train_df["survived"].values
X = train_df.drop(["survived"], axis=1).values
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

val_mean_aucs = []
train_mean_aucs = []
best_epochs = []
best_model = None
best_val_auc = 0

for train_index, test_index in kf.split(X):
    X_train, X_val = X[train_index], X[test_index]
    y_train, y_val = y[train_index], y[test_index]
    pretrainer = TabNetPretrainer(seed=RANDOM_STATE)
    pretrainer.fit(
        X_train, 
        eval_set=[X_train, X_val], 
        eval_name=['train', 'val']
    )
    model = TabNetClassifier(
        seed=RANDOM_STATE,
    )
    model.fit(
        X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], 
        eval_name=['train', 'val'],
        from_unsupervised=pretrainer
    )
    valpreds = model.predict_proba(X_val)[:, 1]
    trainpreds = model.predict_proba(X_train)[:, 1]
    
    val_auc = roc_auc_score(y_val, valpreds)
    train_auc = roc_auc_score(y_train, trainpreds)
    
    if val_auc > best_val_auc:
        best_model = model
        best_val_auc = val_auc
    
    val_mean_aucs.append(val_auc)
    train_mean_aucs.append(train_auc)
    best_epochs.append(model.best_epoch)

2785828985
Device used : cpu
epoch 0  | loss: 5.77944 | train_unsup_loss: 161.8956| val_unsup_loss: 111.24139|  0:00:00s
epoch 1  | loss: 4.6224  | train_unsup_loss: 32.93577| val_unsup_loss: 26.372  |  0:00:00s
epoch 2  | loss: 3.81354 | train_unsup_loss: 22.22789| val_unsup_loss: 14.74843|  0:00:00s
epoch 3  | loss: 2.87812 | train_unsup_loss: 15.39487| val_unsup_loss: 16.60109|  0:00:00s
epoch 4  | loss: 2.76621 | train_unsup_loss: 12.34333| val_unsup_loss: 9.80925 |  0:00:00s
epoch 5  | loss: 2.53548 | train_unsup_loss: 11.56439| val_unsup_loss: 7.98727 |  0:00:00s
epoch 6  | loss: 2.33322 | train_unsup_loss: 8.9364  | val_unsup_loss: 5.58788 |  0:00:01s
epoch 7  | loss: 2.07655 | train_unsup_loss: 4.7163  | val_unsup_loss: 4.00575 |  0:00:01s
epoch 8  | loss: 2.04682 | train_unsup_loss: 4.25163 | val_unsup_loss: 3.60495 |  0:00:01s
epoch 9  | loss: 2.01675 | train_unsup_loss: 3.58374 | val_unsup_loss: 3.00165 |  0:00:01s
epoch 10 | loss: 1.78976 | train_unsup_loss: 4.80753 | val_u

In [6]:
train_mean_aucs = np.array(train_mean_aucs)
val_mean_aucs = np.array(val_mean_aucs)
best_epochs = np.array(best_epochs)

for be, t, v in zip(best_epochs, train_mean_aucs, val_mean_aucs):
    print(f"Best epoch: {be} | train AUC {t} | val AUC {v}")

Best epoch: 34 | train AUC 0.8288465007215007 | val AUC 0.7727840199750312
Best epoch: 16 | train AUC 0.7177703081232494 | val AUC 0.802014652014652
Best epoch: 19 | train AUC 0.8160555270381861 | val AUC 0.8274436090225564
Best epoch: 5 | train AUC 0.7559170588636519 | val AUC 0.7841068917018283
Best epoch: 34 | train AUC 0.8310261777747766 | val AUC 0.7953484195402298


In [7]:
print(train_mean_aucs.mean())
print(val_mean_aucs.mean())

0.7899231145042729
0.7963395184508596


In [8]:
preds = best_model.predict_proba(X)[:, 1]
auc = roc_auc_score(y, preds)
print(f"AUC = {auc}")
fpr, tpr, thr = roc_curve(y, preds)
optimial_thr = thr[np.argmax(tpr - fpr)]
print(optimial_thr)

AUC = 0.8160565813627036
0.4924383


In [9]:
y_hat = (preds > optimial_thr).astype(int)
(y_hat == y).sum() / len(y)

0.7777777777777778

In [10]:
name = 'best_model_train_auc=81_acc=77'
best_model.save_model(name)
loaded_model = TabNetClassifier()
loaded_model.load_model(f"{name}.zip")
loaded_model

Successfully saved model at best_model_train_auc=81_acc=77.zip
Device used : cpu
Device used : cpu


TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=2785828985, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='sparsemax', input_dim=9, output_dim=2, device_name='auto')

In [11]:
test_df = pd.read_csv(osp.join('../data/processed/', "test.csv"), index_col="PassengerId")
X_test = test_df.values
probs = loaded_model.predict_proba(X_test)[:, 1]
y_hat = (probs > optimial_thr).astype(int)

In [12]:
pd.DataFrame(
    data={
        'PassengerId': test_df.index,
        'Survived': y_hat,
    },
).to_csv('submission.csv', index=False)