# Autoencoder

In this notebook I show how to use an autoencoder for dimensionality reduction. The compression in the middle has an additional noise-reducing effect: the decoder can only rebuild the data from the systematic relationships that pass through the bottleneck, so the noise is filtered out there. Note that denoising autoencoders are usually trained by adding noise to the input rather than by compression alone, so here it is assumed that the dataset is already noisy.

In [None]:
# Scratch example: for every element of b, test whether it also occurs in a.
a = ["affe", "huhn"]
b = ["atte", "ratte", "huhn"]

[word in a for word in b]

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as logit
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score
import keras as ks
import tensorflow as tf
from scipy.special import erfinv
import plotly.express as px
import seaborn as sns
import plotly.figure_factory as ff
from plotly.offline import iplot
import math

import warnings
warnings.filterwarnings("ignore")

In [None]:
# --- Autoencoder hyper-parameters -------------------------------------------
compresion = 3   # width of the bottleneck layer handed to autoencoder()
eps = 100        # number of training epochs
bs = 2048        # batch size used for training
leRa = 0.02      # initial learning rate of the Adam optimizer
dec = 0.0001     # learning-rate decay passed to the optimizer

# --- Visualisation ----------------------------------------------------------
randRatioViz = 0.1   # share of rows drawn at random for the plots

# Sanity check shown as the cell output.
# NOTE(review): the check treats the decay as a linear per-epoch reduction;
# confirm that this matches how the optimizer actually applies `decay`.
f"Learning rate and decay ok?: {leRa - dec * eps > 0}"

In [None]:
# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

In [None]:
# Load the raw competition files.
train = pd.read_csv("../input/tabular-playground-series-mar-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv")

# Keep the id columns; they are used later to pick the train / test rows
# back out of the combined frame.
train_id = train["id"]
test_id = test["id"]

In [None]:
# Stack the train features (label removed) on top of the test features and
# index the combined frame by `id`.
Xx = pd.concat([train.drop(columns="target"), test]).set_index("id")

In [None]:
# Preview the first rows of the combined train + test feature frame.
Xx.head()

In [None]:
# Split the feature names into two groups by substring of the column name.
column_names = list(Xx.columns)
catVars = [name for name in column_names if "cat" in name]
contVars = [name for name in column_names if "cont" in name]

In [None]:
# Replace every categorical column by integer codes. The same encoder object
# is re-fitted for each column, so after the loop it only remembers the
# classes of the last column.
le = preprocessing.LabelEncoder()
for col in catVars:
    Xx[col] = le.fit_transform(Xx[col])

Let's check some correlations:

In [None]:
# Boolean mask that keeps roughly `randRatioViz` of the training rows so the
# pair plot stays readable and quick to draw.
r = np.random.random_sample(train.shape[0]) <= randRatioViz

# Continuous features plus the label (used for colouring), selected in one step.
df = train.loc[r, contVars + ["target"]]

sns.set_theme(style="ticks")

sns.pairplot(
    df,
    hue="target",
    palette="viridis",
    kind="hist",
    diag_kind="kde",
    height=2,
    corner=True,
)

# An interactive alternative is plotly's ff.create_scatterplotmatrix(df, ...)
# rendered with iplot(fig).

Rank Gauss

In [None]:
def rg(df, e, Vars):
    """Apply a rank-Gauss transformation to selected columns of ``df``.

    For each selected column the values are ranked, the ranks are rescaled by
    the largest rank into the interval (-1, 1], clipped to
    ``[-1 + e, 1 - e]`` and passed through the inverse error function; the
    result is multiplied by ``sqrt(2)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is modified IN PLACE.
    e : float
        Margin kept away from -1 and 1 before ``erfinv`` is applied, so the
        transformed values stay finite.
    Vars : list
        Labels of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in as ``df``.
    """
    lower_bound = -1 + e
    upper_bound = 1 - e

    for column in df.loc[:, Vars]:
        ranks = df[column].rank()
        # Rescale the ranks so that the largest rank maps to 1.
        scaled = 2 * (ranks / ranks.max() - 0.5)
        clipped = np.clip(scaled, a_min=lower_bound, a_max=upper_bound)
        df[column] = erfinv(clipped) * 2**0.5

    return df

In [None]:
# Rank-Gauss transform the continuous columns. rg() works in place and returns
# its argument, so `Xx_train` and `Xx` refer to the same, transformed frame.
Xx_train = rg(Xx, 1e-6, contVars)

Define the network. The function returns the encoder on its own as well as the full autoencoder (encoder followed by decoder):

In [None]:
#inputDims = len(Xx.cat0.unique())
def autoencoder(DataSet, comp, cat_vars=None):
    """Build an embedding-based autoencoder together with its encoder half.

    Every categorical column gets its own one-value input followed by an
    embedding layer. All remaining columns enter through a single dense input
    named ``"Vars"``. The concatenated features pass through
    Dropout/BatchNormalization/Dense blocks of 150 and 15 units down to a
    linear bottleneck of ``comp`` units, and are expanded again through 15 and
    150 units to a linear output that has one unit per non-categorical column.

    Parameters
    ----------
    DataSet : pandas.DataFrame
        Frame containing the categorical columns and the remaining columns.
        Only its column count and the number of distinct values per
        categorical column are read; the data itself is not used.
    comp : int
        Number of units in the bottleneck layer.
    cat_vars : list of str, optional
        Names of the categorical columns. Defaults to the notebook-level
        ``catVars`` list, so existing two-argument calls behave as before.

    Returns
    -------
    encoder : keras Model
        Maps the inputs to the ``comp``-dimensional bottleneck.
    AE : keras Model
        The full autoencoder; shares all encoder layers with ``encoder``.
    names : list of str
        Names of the categorical input layers (``"<column>_emb"``) in the
        order of ``cat_vars``. The dense input ``"Vars"`` is not included.
    """
    if cat_vars is None:
        cat_vars = catVars

    def _dense_block(tensor, units, activation):
        """Apply Dropout(0.1) -> BatchNormalization -> Dense(units) to ``tensor``."""
        tensor = ks.layers.Dropout(0.1)(tensor)
        tensor = ks.layers.BatchNormalization()(tensor)
        return ks.layers.Dense(units, activation=activation)(tensor)

    inputs = []
    features = []
    names = []

    for c in cat_vars:
        inputDims = len(DataSet[c].unique())
        # One embedding dimension per ten categories, capped at ten.
        embedDim = min(math.ceil(inputDims / 10), 10)

        layer_name = c + "_emb"
        # shape must be a tuple: `(1)` is just the integer 1.
        INPUT = ks.layers.Input(shape=(1,), name=layer_name)
        # NOTE(review): the extra row (+1) presumably leaves room for one
        # code beyond those seen in DataSet -- confirm this is intended.
        OUTPUT = ks.layers.Embedding(inputDims + 1, embedDim)(INPUT)
        # Drop the length-1 sequence axis added by the embedding lookup.
        OUTPUT = ks.layers.Reshape(target_shape=(embedDim,))(OUTPUT)

        inputs.append(INPUT)
        features.append(OUTPUT)
        names.append(layer_name)

    # Everything that is not categorical goes in through one dense input.
    contNum = len(DataSet.columns) - len(cat_vars)
    INPUT = ks.layers.Input(shape=(contNum,), name="Vars")
    features.append(INPUT)
    inputs.append(INPUT)

    CONCAT = ks.layers.Concatenate()(features)

    # Encoder: 150 -> 15 -> comp
    OUT = _dense_block(CONCAT, 150, 'relu')
    OUT = _dense_block(OUT, 15, 'relu')
    OUT = _dense_block(OUT, comp, 'linear')
    encoder = ks.Model(inputs=inputs, outputs=OUT)

    # Decoder: 15 -> 150 -> one unit per non-categorical column. The width is
    # derived from the data instead of the former hard-coded 30 total columns.
    OUT = _dense_block(OUT, 15, 'relu')
    OUT = _dense_block(OUT, 150, 'relu')
    OUT = _dense_block(OUT, contNum, 'linear')
    AE = ks.Model(inputs=inputs, outputs=OUT)

    return encoder, AE, names

In [None]:
# Print the docstring of autoencoder() as a quick reference.
help(autoencoder)

Combining

In [None]:
# Build both models from the combined feature frame; `names` lists the
# categorical input layers returned by autoencoder().
ENCODER, AUTOENCODER, names = autoencoder(Xx, compresion)

# Draw the layer graph of the full autoencoder with tensor shapes.
ks.utils.plot_model(AUTOENCODER, show_shapes=True, show_layer_names=True)

In [None]:
def rmse(y_pred, y_true):
    """Root-mean-squared error between two tensors, reduced to one scalar.

    Both arguments are cast to float32 and the mean is taken over all
    elements. The expression is symmetric in its two arguments, so the order
    in which a caller supplies prediction and target does not affect the
    result.
    """
    prediction = tf.cast(y_pred, dtype="float32")
    target = tf.cast(y_true, dtype="float32")
    squared_error = tf.square(prediction - target)
    return tf.sqrt(tf.keras.backend.mean(squared_error))

In [None]:
# Early stopping on the training loss: stop once it has not improved by at
# least `min_delta` for 10 epochs. The autoencoder is compiled with a loss only
# (no metrics), so an 'AUC' quantity is never logged; 'loss' must be monitored
# and minimised instead.
# NOTE: the callback only takes effect when it is passed to fit(callbacks=[...]).
stop = ks.callbacks.EarlyStopping(monitor='loss', min_delta=0.000001, patience=10, mode='min')

In [None]:
# Adam with the configured initial learning rate and decay. `learning_rate` is
# the current argument name; `lr` is only a deprecated alias.
# NOTE(review): newer Keras optimizers no longer accept `decay`; if this
# raises, switch to a learning-rate schedule -- confirm against the installed
# version.
optimizer = ks.optimizers.Adam(learning_rate=leRa, decay=dec)

# The reconstruction error is measured with the custom RMSE loss.
AUTOENCODER.compile(optimizer=optimizer, loss=rmse)

In [None]:
# Model inputs keyed by input-layer name. Each categorical column is looked up
# by its name (not by position), so the mapping stays correct even if the
# categorical columns are not the leading columns of the frame.
X_Train = {name: Xx_train[col] for name, col in zip(names, catVars)}

# All remaining columns feed the dense "Vars" input.
X_Train["Vars"] = Xx_train.drop(catVars, axis=1)

In [None]:
history = ks.callbacks.History()

# Train the autoencoder to reproduce its own non-categorical inputs. The target
# is taken from `Xx_train`, the same frame the inputs were built from, instead
# of relying on `Xx` happening to be the same transformed object.
AUTOENCODER.fit(
    X_Train,
    Xx_train.drop(catVars, axis=1),
    epochs=eps,
    batch_size=bs,
    shuffle=False,
    callbacks=[history],
)

# history.history holds the per-epoch training log.
#print(history.history)

In [None]:
# Reconstruct the non-categorical features for every row. `workers` and
# `use_multiprocessing` were dropped: they only concern generator inputs and
# are not accepted by newer Keras versions.
denoised_values = AUTOENCODER.predict(x=X_Train)

# Give the reconstruction the `id` index of the rows that were fed to the
# network. With the default 0..n-1 index, pd.concat(axis=1) would align the
# two frames by label and pair reconstructed values with the wrong ids.
denoised_cont = pd.DataFrame(denoised_values, columns=contVars, index=Xx_train.index)

# Encoded categorical columns next to the reconstructed continuous ones.
Denoised = pd.concat([Xx.loc[:, catVars], denoised_cont], axis=1)

Denoised.head()

In [None]:
# Encoder inputs for the training rows only. `train_id` holds id LABELS, so the
# rows have to be selected with .loc on the id index; .iloc would treat the ids
# as row positions and pick unrelated rows.
X_Compress = {name: Xx_train.loc[train_id, col] for name, col in zip(names, catVars)}
X_Compress["Vars"] = Xx_train.loc[train_id, :].drop(catVars, axis=1)

In [None]:
# Bottleneck activations for the selected rows. `workers` and
# `use_multiprocessing` were dropped: they only concern generator inputs and
# are not accepted by newer Keras versions.
compressed_values = ENCODER.predict(x=X_Compress)

Compressed = pd.DataFrame(
    compressed_values,
    columns=["dim_{0}".format(i) for i in range(compressed_values.shape[1])],
)

# Attach the label as a categorical so the plot uses discrete colours.
# NOTE(review): the assignment aligns on the index; this assumes `train` still
# has its default 0..n-1 index and the same row order as X_Compress -- confirm.
Compressed["target"] = train.target.astype("category")
Compressed.head()

Compressed Representation

In [None]:
# Plot only a random ~randRatioViz share of the points. The sample is stored
# under its own name so that `Compressed` stays intact and re-running this cell
# does not shrink the data again.
r = np.random.random_sample(Compressed.shape[0]) <= randRatioViz
compressed_sample = Compressed.loc[r, :]

# NOTE: the axes are hard-wired to dim_0..dim_2, i.e. a 3-unit bottleneck.
fig = px.scatter_3d(
     compressed_sample,
     x='dim_0',
     y='dim_1',
     z='dim_2',
     color='target',
     hover_data={'dim_0': False,
                 'dim_1': False,
                 'dim_2': False,
                 'target': True
             },
     opacity=1,
     color_discrete_sequence=px.colors.qualitative.Antique,
     title="Compressed Representation",
     template="simple_white"
     )

fig.update_traces(marker=dict(size=6,
                              line=dict(width=1,
                                        color='grey')),
                  selector=dict(mode='markers'))

fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                 scene=dict(bgcolor='white'))

fig.show()

Using the denoised data:

In [None]:
# Mark the encoded categorical columns as pandas categoricals.
for col in catVars:
    Denoised[col] = Denoised[col].astype("category")

# Select the train and test rows by their id labels.
tr = Denoised.loc[train_id, :]
te = Denoised.loc[test_id, :]

print(tr.shape)
te.shape

In [None]:
# Hold back a fixed share of the training rows for validation.
cv_size = 0.2
X_train, X_test, y_train, y_test = tts(
    tr,
    train["target"],
    test_size=cv_size,
    random_state=42,
)

y_train.describe()

In [None]:
# Random forest on the denoised features.
clf = RF(
    n_estimators=80,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=0,
)
clf.fit(X_train, y_train)

# Hold-out evaluation: AUC on the second predict_proba column, plus accuracy.
holdout_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
holdout_accuracy = clf.score(X_test, y_test)

print("AUC is: " + str(holdout_auc))
print("Accuracy is: " + str(holdout_accuracy))

In [None]:
submission = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv")

# predict_proba returns one column per class; only the second column can be
# stored in the single `target` column (the same column the AUC was computed
# on). Assigning the whole two-column array does not fit.
# NOTE(review): assumes the sample submission lists the ids in the same order
# as test.csv -- confirm.
submission['target'] = clf.predict_proba(te)[:, 1]

submission.to_csv("submission.csv", index=False)

# Last expression of the cell, so the summary is actually displayed.
submission['target'].describe()