This **Code** is mainly based on [https://www.kaggle.com/abhishek/same-old-entity-embeddings](https://www.kaggle.com/abhishek/same-old-entity-embeddings) and the submission from [https://www.kaggle.com/craigmthomas/tps-mar-2021-stacked-starter](https://www.kaggle.com/craigmthomas/tps-mar-2021-stacked-starter) was used for blend,

In [1]:
from tensorflow.keras import layers, optimizers, callbacks, utils, losses, metrics, backend as K
from sklearn import metrics as skmetrics, preprocessing
from tensorflow.keras.models import Model, load_model
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
import os, gc, joblib, warnings
import tensorflow_addons as tfa
import tensorflow as tf
import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')

In [2]:
def create_model(data, catcols):    
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 20))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.25)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    y = layers.Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=y)
    return model

In [3]:
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')


In [4]:
test_id = test.id.values
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)

In [5]:
sparse_features = [f for f in train.columns if 'cat' in f]
dense_features = [feat for feat in train.columns if  feat not in sparse_features+['target']]

for col in sparse_features:
    train_only = list(set(train[col].unique()) - set(test[col].unique()))
    test_only = list(set(test[col].unique()) - set(train[col].unique()))
    both = list(set(test[col].unique()).union(set(train[col].unique())))
    train.loc[train[col].isin(train_only), col] = np.nan
    test.loc[test[col].isin(test_only), col] = np.nan
    mode = train[col].mode().values[0]
    train[col] = train[col].fillna(mode)
    test[col] = test[col].fillna(mode)
    

In [6]:
for feat in dense_features:
    test[feat] = np.clip(test[feat], train[feat].min(), train[feat].max())

In [7]:
test["target"] = -1
data = pd.concat([train, test]).reset_index(drop=True)

for c in dense_features:
    data[f'q_{c}'], bins_ = pd.qcut(data[c], 25, retbins=True, labels=[i for i in range(25)])
    data[f'q_{c}'] = data[f'q_{c}'].astype('str')
    sparse_features.append(f'q_{c}')

In [8]:
features = sparse_features
for feat in features:
    lbl_enc = preprocessing.OrdinalEncoder()
    data[feat] = lbl_enc.fit_transform(data[feat].fillna('-1').values.reshape(-1,1).astype(str))
    
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)
test_data = [test.loc[:, features].values[:, k] for k in range(test.loc[:, features].values.shape[1])]

In [9]:
oof_preds = np.zeros((len(train)))
bagged_oof_preds = np.zeros((len(train)))
test_preds = np.zeros((len(test)))
bagged_test_preds = np.zeros((len(test)))
learning_rate = 1e-3
label_smoothing = 0.0
Verbose = 0
n_splits = [10, 15]
n_bags = 2
seeds = [2021, 2021]

In [10]:
for bag in range(n_bags):
    print(f'Iteration {bag+1} splits {n_splits[bag]} seed {seeds[bag]}')
    for fold, (train_index, test_index) in enumerate(StratifiedKFold(n_splits=n_splits[bag], shuffle=True, random_state=seeds[bag]).split(train, train.target.values)):
        X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_train, y_test = X_train.target.values, X_test.target.values
        model = create_model(data, features)
        model.compile(
            optimizer=tfa.optimizers.SWA(tf.keras.optimizers.Adam(learning_rate=learning_rate)),
            loss=losses.BinaryCrossentropy(label_smoothing=label_smoothing),
            metrics=metrics.AUC(name="AUC"),
        )

        X_train = [X_train.loc[:, features].values[:, k] for k in range(X_train.loc[:, features].values.shape[1])]
        X_test = [X_test.loc[:, features].values[:, k] for k in range(X_test.loc[:, features].values.shape[1])]
    
        es = callbacks.EarlyStopping(monitor='val_AUC', min_delta=0.000001, patience=10, verbose=Verbose, mode='max', baseline=None, restore_best_weights=True)
        sb = callbacks.ModelCheckpoint('./nn_model.w8', save_weights_only=True, save_best_only=True, verbose=Verbose, monitor='val_AUC',mode='max')
        plateau  = callbacks.ReduceLROnPlateau(monitor='val_AUC', factor=0.5, patience=2, verbose=Verbose,
                                        mode='max', min_delta=0.0001, cooldown=0, min_lr=1e-7)
        model.fit(X_train,
                  y_train,
                  validation_data=(X_test, y_test),
                  verbose=Verbose,
                  batch_size=1024,
                  callbacks=[es, sb, plateau],
                  epochs=100
             )
        valid_fold_preds = model.predict(X_test)
        test_fold_preds = model.predict(test_data)
        oof_preds[test_index] = rankdata(valid_fold_preds.ravel())/len(X_test)
        test_preds += rankdata(test_fold_preds.ravel() / n_splits[bag])/len(test)
        print(f'fold {fold+1} AUC : {skmetrics.roc_auc_score(y_test, valid_fold_preds)}')
        K.clear_session()
    print(f'Overall AUC of Iteration {bag+1} = {skmetrics.roc_auc_score(train.target.values, oof_preds)}')
    np.save(f'oof_preds_{bag}',oof_preds)
    np.save(f'test_preds_{bag}',test_preds)
    bagged_test_preds += test_preds / n_bags
    bagged_oof_preds += oof_preds / n_bags

Iteration 1 splits 10 seed 2021
fold 1 AUC : 0.8961918170969543
fold 2 AUC : 0.8943718985440651
fold 3 AUC : 0.8928650772326882
fold 4 AUC : 0.8919611653395161
fold 5 AUC : 0.8940623021939308
fold 6 AUC : 0.8941781197838289
fold 7 AUC : 0.8950134547216109
fold 8 AUC : 0.8942711573274676
fold 9 AUC : 0.8903416792377293
fold 10 AUC : 0.8930819030632879
Overall AUC of Iteration 1 = 0.8936338522666155
Iteration 2 splits 15 seed 2021
fold 1 AUC : 0.8949696084929532
fold 2 AUC : 0.8966754501061742
fold 3 AUC : 0.8939437589461702
fold 4 AUC : 0.8912614695776577
fold 5 AUC : 0.8951690774548317
fold 6 AUC : 0.8903163962934546
fold 7 AUC : 0.8923044990067063
fold 8 AUC : 0.8948867136350268
fold 9 AUC : 0.8969503860022942
fold 10 AUC : 0.8945787660007172
fold 11 AUC : 0.8991855767627449
fold 12 AUC : 0.8917752848473403
fold 13 AUC : 0.892725469255858
fold 14 AUC : 0.8911767833170773
fold 15 AUC : 0.8932440965471888
Overall AUC of Iteration 2 = 0.8939442133469299


In [11]:
print("Overall AUC={}".format(skmetrics.roc_auc_score(train.target.values, bagged_oof_preds)))

Overall AUC=0.8947215641553808


In [12]:
print('Saving submission file')
submission = pd.DataFrame.from_dict({
    'id': test_id,
    'target': bagged_test_preds,
})
submission.to_csv('submission.csv', index=False)
submission.head(3)

Saving submission file


Unnamed: 0,id,target
0,5,8.508183
1,6,13.826033
2,8,1.334588


In [13]:
top_public = pd.read_csv('/kaggle/input/tps-mar-2021-stacked-starter/submission.csv')
submission['target'] = (rankdata(submission.target) * 0.275 + rankdata(top_public.target) * 0.725)/len(submission)
submission.to_csv('blend.csv', index=False)
submission.head(3)

Unnamed: 0,id,target
0,5,0.469261
1,6,0.766515
2,8,0.062625
