# EXPERIMENT .... LET'S MAKE A TEST ON CATEGORICAL ENCODED DATA



### This notebook provides way to encode all data using NN (Embedding). Then I build model using LightAutoML.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
from sklearn.metrics import log_loss


import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Flatten, Dropout, BatchNormalization, Embedding, Input
from keras.layers.merge import concatenate
from keras.utils import to_categorical

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv", index_col = 'id')
test = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv", index_col = 'id')

X = train.drop('target', axis = 1)

lencoder = LabelEncoder()
y = pd.DataFrame(lencoder.fit_transform(train['target']), columns=['target'])

df_all = pd.concat([X, test], axis = 0)
df_all = df_all.astype("category")

# PART 1. CATEGORICAL ENCODING

In [3]:
class __LabelEncoder__(LabelEncoder):

    def transform(self, y):

        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)

        unseen = len(self.classes_)

        e = np.array([
                     np.searchsorted(self.classes_, x)
                     if x in self.classes_ else unseen
                     for x in y
                     ])

        if unseen in e:
            self.classes_ = np.array(self.classes_.tolist() + ['unseen'])

        return e

def get_encoded_data(data, categorical_variables=None):
   
    encoders = {}

    df = data.copy()

    if categorical_variables is None:
        categorical_variables = [col for col in df.columns if df[col].dtype == 'category']

    for var in categorical_variables:
        encoders[var] = __LabelEncoder__()
        df.loc[:, var] = encoders[var].fit_transform(df[var])

    return df.astype("category"), encoders

In [4]:
df_all, encoders = get_encoded_data(df_all)
train, test, y = df_all[:len(train)].to_numpy(), df_all[len(train):].to_numpy(), y.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=1)

In [5]:
X_train_enc = [X_train[:, i] for i in range(X_train.shape[1])]
X_test_enc = [X_test[:, i] for i in range(X_test.shape[1])]
test_enc = [test[:, i] for i in range(test.shape[1])]

y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

X_train_enc[0:5]

[array([0, 0, 0, ..., 0, 1, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 1, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 1, 0]),
 array([ 0,  0, 23, ...,  0,  0,  1])]

In [6]:
categorical_variables = df_all.select_dtypes(include='category').columns
info = {col:(df_all[col].nunique(),min(50,(df_all[col].nunique()+ 1) //2)) for col in categorical_variables}

### For coding we will use Embedding leayer (3 embeddings for each categorical feature)

In [7]:
input_layers = list()
embedding_layers = list()

for feature in categorical_variables:
    n_labels = df_all[feature].nunique()
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(n_labels, 3)(input_layer)
    input_layers.append(input_layer)
    embedding_layers.append(embedding_layer)
    
merge = concatenate(embedding_layers)

dense_1 = Dense(128, kernel_initializer='normal', activation='relu')(merge)
x = BatchNormalization()(dense_1)
x = Dropout(0.5)(x)
dense_2 = Dense(32, kernel_initializer='normal', activation='relu')(x)
x = BatchNormalization()(dense_2)
x = Dropout(0.25)(x)
flatten = Flatten()(x)
output = Dense(4, activation='softmax')(flatten)
model = Model(inputs=input_layers, outputs=output)

In [8]:
model.compile(loss = "categorical_crossentropy", optimizer = tf.keras.optimizers.Adam(), metrics=['accuracy'])

In [9]:
model.fit(X_train_enc, y_train_enc, validation_data=(X_test_enc, y_test_enc), epochs=20, batch_size=64, verbose=2)

Epoch 1/20
1250/1250 - 26s - loss: 1.2075 - accuracy: 0.5313 - val_loss: 1.1032 - val_accuracy: 0.5751
Epoch 2/20
1250/1250 - 19s - loss: 1.1060 - accuracy: 0.5767 - val_loss: 1.0990 - val_accuracy: 0.5770
Epoch 3/20
1250/1250 - 20s - loss: 1.0971 - accuracy: 0.5772 - val_loss: 1.0978 - val_accuracy: 0.5761
Epoch 4/20
1250/1250 - 19s - loss: 1.0941 - accuracy: 0.5781 - val_loss: 1.0976 - val_accuracy: 0.5770
Epoch 5/20
1250/1250 - 19s - loss: 1.0919 - accuracy: 0.5787 - val_loss: 1.0941 - val_accuracy: 0.5773
Epoch 6/20
1250/1250 - 20s - loss: 1.0895 - accuracy: 0.5802 - val_loss: 1.0964 - val_accuracy: 0.5765
Epoch 7/20
1250/1250 - 19s - loss: 1.0882 - accuracy: 0.5802 - val_loss: 1.0971 - val_accuracy: 0.5785
Epoch 8/20
1250/1250 - 20s - loss: 1.0871 - accuracy: 0.5806 - val_loss: 1.0954 - val_accuracy: 0.5770
Epoch 9/20
1250/1250 - 19s - loss: 1.0856 - accuracy: 0.5811 - val_loss: 1.0958 - val_accuracy: 0.5770
Epoch 10/20
1250/1250 - 20s - loss: 1.0842 - accuracy: 0.5812 - val_loss:

<tensorflow.python.keras.callbacks.History at 0x7ff3ac708190>

In [10]:
embs = list(map(lambda x: x.get_weights()[0], [x for x in model.layers if 'Embedding' in str(x)]))
embeddings = {var: emb for var, emb in zip(info.keys(), embs)}

In [11]:
embeddings_df = {}
for cat_var in tqdm(embeddings.keys()):
    df = pd.DataFrame(embeddings[cat_var])
    df.index = encoders[cat_var].classes_
    df.columns = [cat_var +  '_embedding_' + str(num) for num in df.columns]
    embeddings_df[cat_var] = df

  0%|          | 0/50 [00:00<?, ?it/s]

### Let's look into feature_0 coded 

In [12]:
embeddings_df['feature_0'].head(5)

Unnamed: 0,feature_0_embedding_0,feature_0_embedding_1,feature_0_embedding_2
0,-0.084046,-0.132137,0.001385
1,-0.05603,0.137385,0.057242
2,0.049321,-0.031253,-0.053845
3,-0.004126,-0.08428,-0.146389
4,0.143491,0.110588,-0.031996


In [13]:
embeddings_df['feature_1'].head(5)

Unnamed: 0,feature_1_embedding_0,feature_1_embedding_1,feature_1_embedding_2
0,0.021667,-0.177334,-0.024177
1,-0.04734,0.028615,0.103602
2,-0.047948,0.174049,-0.011884
3,0.030107,0.117597,-0.022041
4,0.047781,0.123179,0.168474


### Let's look into graphical representation of feature embedding

In [14]:
fig = px.scatter_3d(embeddings_df['feature_3'], x='feature_3_embedding_0', y='feature_3_embedding_1', z='feature_3_embedding_2', color =embeddings_df['feature_3'].index)
fig.show()

In [15]:
fig = px.scatter_3d(embeddings_df['feature_35'], x='feature_35_embedding_0', y='feature_35_embedding_1', z='feature_35_embedding_2', color =embeddings_df['feature_35'].index)
fig.show()

In [16]:
fig = px.scatter_3d(embeddings_df['feature_18'], x='feature_18_embedding_0', y='feature_18_embedding_1', z='feature_18_embedding_2', color =embeddings_df['feature_18'].index)
fig.show()

### Let's encode input data into embedding values

In [17]:
def fit_transform(data, embeddings, encoders, drop_categorical_vars=False):

    dfs={}
    for cat_var in tqdm(embeddings.keys()):
        df = pd.DataFrame(embeddings[cat_var])
        df.index = encoders[cat_var].classes_
        df.columns = [cat_var +  '_embedding_' + str(num) for num in df.columns]
        data = data.merge(df, how='left', left_on=cat_var, right_index=True)

    if drop_categorical_vars:
        return data.drop(list(embeddings.keys()), axis=1)
    else:
        return data

df_categorical_coded = fit_transform(df_all, embeddings, encoders, True)
train_categorical_coded, test_categorical_coded= df_categorical_coded[:len(train)], df_categorical_coded[len(train):]
train_categorical_coded['target'] = y

  0%|          | 0/50 [00:00<?, ?it/s]

In [18]:
train_categorical_coded.head(5)

Unnamed: 0_level_0,feature_0_embedding_0,feature_0_embedding_1,feature_0_embedding_2,feature_1_embedding_0,feature_1_embedding_1,feature_1_embedding_2,feature_2_embedding_0,feature_2_embedding_1,feature_2_embedding_2,feature_3_embedding_0,...,feature_47_embedding_0,feature_47_embedding_1,feature_47_embedding_2,feature_48_embedding_0,feature_48_embedding_1,feature_48_embedding_2,feature_49_embedding_0,feature_49_embedding_1,feature_49_embedding_2,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,-0.111526,-0.063427,0.160287,0.003858,...,-0.088006,0.040383,-0.084138,-0.051489,-0.035701,-0.09957,0.107528,-0.002068,0.002917,1
1,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,-0.088006,0.040383,-0.084138,-0.051489,-0.035701,-0.09957,0.107528,-0.002068,0.002917,0
2,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,0.284729,-0.177793,0.32295,-0.009157,-0.035696,-0.06095,0.107528,-0.002068,0.002917,0
3,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,-0.088006,0.040383,-0.084138,0.015703,-0.075244,-0.039871,0.107528,-0.002068,0.002917,3
4,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,-0.088006,0.040383,-0.084138,0.015703,-0.075244,-0.039871,0.107528,-0.002068,0.002917,1


In [19]:
test_categorical_coded.head(5)

Unnamed: 0_level_0,feature_0_embedding_0,feature_0_embedding_1,feature_0_embedding_2,feature_1_embedding_0,feature_1_embedding_1,feature_1_embedding_2,feature_2_embedding_0,feature_2_embedding_1,feature_2_embedding_2,feature_3_embedding_0,...,feature_46_embedding_2,feature_47_embedding_0,feature_47_embedding_1,feature_47_embedding_2,feature_48_embedding_0,feature_48_embedding_1,feature_48_embedding_2,feature_49_embedding_0,feature_49_embedding_1,feature_49_embedding_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,-0.082183,-0.088006,0.040383,-0.084138,-0.051489,-0.035701,-0.09957,0.107528,-0.002068,0.002917
100001,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,-0.111526,-0.063427,0.160287,0.003858,...,0.158674,-0.088006,0.040383,-0.084138,-0.009157,-0.035696,-0.06095,0.083531,-0.041591,0.019696
100002,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,-0.082183,-0.088006,0.040383,-0.084138,-0.049514,-0.136669,-0.121888,0.107528,-0.002068,0.002917
100003,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,0.131358,0.132386,-0.266956,0.08591,-0.065098,-0.317084,0.132901,-0.018634,0.169554,-0.000248
100004,-0.084046,-0.132137,0.001385,0.021667,-0.177334,-0.024177,0.163006,-0.022721,-0.125094,0.003858,...,-0.082183,-0.088006,0.040383,-0.084138,-0.051489,-0.035701,-0.09957,0.107528,-0.002068,0.002917


In [20]:
train_categorical_coded.to_csv("tps-05-train_categorical_coded.csv")
test_categorical_coded.to_csv("tps-05-test_categorical_coded.csv")

# PART 2. Autoencoder on Tabular Data


From post in comments - Alexander Ryzhkov:
> What are the fix variants which can help:
> 1) Try not to use target while you create embeddings - you can use autoencoder for that
> 2) If you want to use the target, you can do it based on cross-validation, but in this situation you can use only OOF predictions instead of categorical embeddings because for 2 different runs on k-1 folds embeddings for sure do not have same columns to concat them vertically.

In [21]:
from sklearn.preprocessing import MinMaxScaler 

scaler = MinMaxScaler()

X_train = scaler.fit_transform(df_all[:len(train)])
X_validation = scaler.transform(df_all[len(train):])

In [22]:
# Let's define simple AutoEncoder

encoding_dim = 40

input_size = len(df_all.columns)

input_df = Input(shape=(input_size,))
x = Dense(32, kernel_initializer='normal', activation='relu')(input_df)
encoded = Dense(encoding_dim, activation='relu')(x)
x = Dense(32, kernel_initializer='normal', activation='relu')(encoded)
decoded = Dense(input_size, activation='sigmoid')(x)

autoencoder = Model(input_df, decoded)

autoencoder.compile(optimizer='adadelta', loss='mean_squared_error')

autoencoder.fit(X_train, X_train,
                epochs=250,
                batch_size=256,
                shuffle=True,
                validation_data=(X_validation, X_validation))

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7ff37ce1d690>

In [23]:
encoder = Model(input_df, encoded)



auto_enc_X_train = pd.DataFrame(encoder.predict(df_all[:len(train)]))
auto_enc_X_train.columns = ['f_' + str(num) for num in auto_enc_X_train.columns]
auto_enc_X_train['target'] =  y
auto_enc_test = pd.DataFrame(encoder.predict(df_all[len(train):]))
auto_enc_test.columns = ['f_' + str(num) for num in auto_enc_test.columns]

In [24]:
auto_enc_X_train.head(5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_31,f_32,f_33,f_34,f_35,f_36,f_37,f_38,f_39,target
0,0.0,0.555975,3.272596,1.505373,2.46719,1.757412,1.934829,0.704205,2.613808,0.0,...,0.0,3.501558,0.0,0.0,2.650594,0.047986,0.89053,0.0,0.0,1
1,0.0,0.956875,1.903848,0.895021,1.996428,0.720968,1.040129,0.614494,2.369416,0.0,...,0.0,1.866915,0.0,0.0,1.111554,0.001715,0.259031,0.0,0.217043,0
2,0.0,1.334568,1.570528,0.539364,1.992273,1.304327,1.075451,0.0,2.441133,0.437408,...,0.0,1.273411,0.0,0.0,1.769437,0.0,0.0,0.0,0.0,0
3,0.0,1.260125,3.219783,0.869997,3.111172,1.749663,1.396731,0.431052,3.288814,0.0,...,0.0,2.880494,0.0,0.0,1.883665,0.0,0.0,0.0,1.137641,3
4,0.0,0.687631,1.812418,0.588929,2.029459,0.925855,1.116901,0.199536,3.078559,0.0,...,0.0,1.469665,0.0,0.0,1.509623,0.007083,0.372583,0.0,0.392597,1


In [25]:
auto_enc_test.head(5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_30,f_31,f_32,f_33,f_34,f_35,f_36,f_37,f_38,f_39
0,0.0,1.223427,2.679945,0.898021,2.68488,1.998158,1.28472,0.386677,2.702899,0.0,...,0.0,0.0,3.01562,0.0,0.0,1.458722,0.03932,0.10174,0.0,0.526759
1,0.0,1.945553,3.087953,1.391811,3.101813,0.999694,1.713151,0.42629,3.487511,0.0,...,0.0,0.0,3.418587,0.0,0.0,1.780039,0.028047,0.094209,0.0,0.504167
2,0.0,3.980857,2.431899,1.262034,3.417264,0.708282,0.965427,1.805205,3.721495,0.0,...,0.0,0.0,4.118979,0.0,0.0,2.010127,0.455232,0.0,0.0,1.351588
3,0.0,1.069883,2.810406,1.126441,2.641521,0.494284,1.245421,1.055062,2.993886,0.0,...,0.0,0.0,1.674796,0.0,0.0,0.894351,0.0,0.0,0.0,0.900717
4,0.0,2.143997,2.210137,0.942007,2.22656,1.581595,1.047793,0.001433,2.766952,0.0,...,0.0,0.0,2.767054,0.0,0.0,2.15221,0.329523,0.100778,0.0,0.617638


In [26]:
auto_enc_X_train.to_csv("tps-05-train_autoencoder-40_coded.csv")
auto_enc_test.to_csv("tps-05-test_autoencoder-40_coded.csv")

# LightAUTOML test

In [27]:
# Train LightAutoML on AutoEncoder or Categorical Encoding

AUTO_ENCODER = True

In [28]:
if AUTO_ENCODER:
    train_LightAutoML = auto_enc_X_train
    test_LightAutoML = auto_enc_test
else:
    train_LightAutoML = train_categorical_coded
    test_LightAutoML = test_categorical_coded

In [29]:
pip install -U lightautoml -q

Note: you may need to restart the kernel to use updated packages.


In [30]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

import pandas as pd

In [31]:
N_THREADS = 4 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 4 * 3600 # Time in seconds for automl run
TARGET_NAME = 'target'

In [32]:
task = Task('multiclass',)

roles = {
    'target': TARGET_NAME,
    'drop': ['id'],
}

In [33]:
automl = TabularUtilizedAutoML(task = task, 
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS})
oof_pred = automl.fit_predict(train_LightAutoML, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'random_state': 42}
Start automl preset with listed constraints:
- time: 14399.99601149559 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (100000, 41)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 14367.123786211014 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = -1.1168366664156317
Linear model: C = 5e-05 score = -1.1162448940724135
Linear model: C = 0.0001 score = -1.1161141209617258
Linear model: C = 0.0005 score = -1.116110008496046
Linear model: C = 0.001 score = -1.1161877275675536
Linear model: C = 0.005 score = -1.1161868405535817

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Li

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.1159
[200]	valid's multi_logloss: 1.11896
Early stopping, best iteration is:
[47]	valid's multi_logloss: 1.11478
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.11573
[200]	valid's multi_logloss: 1.11895
Early stopping, best iteration is:
[51]	valid's multi_logloss: 1.11492
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.11446
[200]	valid'

In [34]:
test_pred = automl.predict(test_LightAutoML)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(log_loss(train_LightAutoML[TARGET_NAME].values, oof_pred.data)))

Prediction for test data:
array([[0.08763607, 0.5486574 , 0.23936714, 0.1243394 ],
       [0.0878091 , 0.61484355, 0.19707932, 0.10026807],
       [0.07607333, 0.59588987, 0.2127921 , 0.11524472],
       [0.08315544, 0.54995537, 0.2530763 , 0.11381294],
       [0.08148254, 0.5804419 , 0.22069538, 0.11738019],
       [0.08325876, 0.5577323 , 0.22356576, 0.13544317],
       [0.08125921, 0.5300908 , 0.2590876 , 0.12956241],
       [0.08333592, 0.5999717 , 0.17515057, 0.14154185],
       [0.08789296, 0.54389113, 0.24614538, 0.12207057],
       [0.08153614, 0.57234436, 0.22276488, 0.12335466]], dtype=float32)
Shape = (50000, 4)
Check scores...
OOF score: 1.1116566297978163


In [35]:
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

submission.iloc[:, 1:] = test_pred.data
submission.to_csv('Autoencoder+EMB+LigtAutoml.csv', index = False)

In [36]:
submission.drop("id", axis=1).describe().T.style.bar(subset=['mean'], color='#205ff2')\
                            .background_gradient(subset=['std'], cmap='Reds')\
                            .background_gradient(subset=['50%'], cmap='coolwarm')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Class_1,50000.0,0.08447,0.005665,0.033647,0.080951,0.083974,0.087464,0.157328
Class_2,50000.0,0.575876,0.031172,0.352091,0.555843,0.578109,0.597282,0.728394
Class_3,50000.0,0.214066,0.026244,0.097866,0.196051,0.210356,0.228117,0.52443
Class_4,50000.0,0.125589,0.025386,0.046897,0.10769,0.12187,0.141223,0.275011
