In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline
import random
def seed_everything(seed=2020):
    """Seed the global RNGs of Python's ``random`` module and of NumPy.

    Args:
        seed: integer seed handed to both generators (defaults to 2020).
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(42)

In [None]:
# Read the raw CSV and discard the 'Unnamed: 0' column before inspecting it.
df = pd.read_csv('../input/plasmaetch/plasmaetch.csv')
df = df.drop(['Unnamed: 0'], axis=1)
print("Size of the dataframe: ", df.shape)
display(df.head(5))

In [None]:
# Partition the rows on the target: label == 1 versus every other label value.
is_label_one = df['label'] == 1
df_ab = df[is_label_one]
df_nnorm = df[~is_label_one]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Column names of the label == 1 rows without 'bid' ('label' is still included here).
column_list = df_ab.drop(['bid'], axis=1).columns.tolist()

# Pure feature frame: neither the 'bid' column nor the target.
df_values = df_ab.drop(['bid', 'label'], axis=1)

# Min-max scale the features; df_norm is the NumPy array returned by the scaler.
scaler = MinMaxScaler()
df_norm = scaler.fit_transform(df_values)

In [None]:
# Sanity check: (rows, columns) of the unscaled feature frame built from the label == 1 rows.
df_values.shape

## Training the Variational Autoencoder (VAE)


In [None]:
from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Lambda, Layer, Add, Multiply
from keras.models import Model, Sequential

import argparse
import os

In [None]:
# network parameters
original_dim = df_values.shape[1]        # width of the VAE input (one unit per feature column)
input_shape = (original_dim,)
intermediate_dim = original_dim // 2     # hidden layer is half the input width
# NOTE(review): a later cell assigns the original column names to the latent
# columns, which only works when latent_dim == original_dim — confirm.
latent_dim = 18
batch_size = 128
epochs = 100
epsilon_std = 1.0                        # stddev of the noise tensor built in vae_arc

In [None]:
class KLDivergenceLayer(Layer):
    """Pass-through layer that registers the KL-divergence term as a model loss.

    The layer receives ``[mu, log_var]`` and returns it unchanged. As a side
    effect, the batch mean of the KL term computed from those two tensors is
    attached to the model through ``add_loss``.
    """

    def __init__(self, *args, **kwargs):
        self.is_placeholder = True
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):
        mean, log_variance = inputs

        # Per-sample term: -0.5 * sum(1 + log_var - mu^2 - exp(log_var)) over the last axis.
        per_dimension = 1 + log_variance - K.square(mean) - K.exp(log_variance)
        kl_per_sample = - .5 * K.sum(per_dimension, axis=-1)

        # Register the batch-averaged term so it is added to the compiled loss.
        self.add_loss(K.mean(kl_per_sample), inputs=inputs)

        return inputs

In [None]:
# VAE Architecture
# * original_dim - Original Input Dimension
# * intermediate_dim - Hidden Layer Dimension
# * latent_dim - Latent Dimension
from keras.layers.normalization import BatchNormalization

def vae_arc(original_dim, intermediate_dim, latent_dim):
    """Wire up the VAE graph and hand back the tensors needed to build models.

    Args:
        original_dim: width of the input (and of the reconstructed output).
        intermediate_dim: width of the hidden layer in encoder and decoder.
        latent_dim: width of the latent code.

    Returns:
        Tuple ``(x, eps, z_mu, x_pred)``: the data input, the noise input,
        the latent mean (after the KL layer) and the decoder output.
    """
    # Encoder trunk
    x = Input(shape=(original_dim,))
    hidden = Dense(intermediate_dim, activation='relu')(x)

    # Decoder is instantiated here and applied to the sampled latent code below.
    decoder = Sequential([
        Dense(intermediate_dim, input_dim=latent_dim, activation='relu'),
        Dense(original_dim, activation='sigmoid'),
    ])

    # Two heads on the hidden layer: latent mean and latent log-variance.
    z_mu = Dense(latent_dim)(hidden)
    z_log_var = Dense(latent_dim)(hidden)

    # Identity layer whose only job is to add the KL term to the model loss.
    z_mu, z_log_var = KLDivergenceLayer()([z_mu, z_log_var])
    z_sigma = Lambda(lambda log_var: K.exp(.5 * log_var))(z_log_var)

    # Noise tensor with one row per input row; stddev comes from the
    # module-level epsilon_std.
    noise = K.random_normal(stddev=epsilon_std,
                            shape=(K.shape(x)[0], latent_dim))
    eps = Input(tensor=noise)

    # z = mu + sigma * eps
    scaled_noise = Multiply()([z_sigma, eps])
    z = Add()([z_mu, scaled_noise])

    x_pred = decoder(z)

    return x, eps, z_mu, x_pred

In [None]:
def nll(y_true, y_pred):
    """Bernoulli negative log likelihood, summed over the last axis."""
    # keras.losses.binary_crossentropy would average over the last axis;
    # the backend version is used so the terms can be summed instead.
    crossentropy = K.binary_crossentropy(y_true, y_pred)
    return K.sum(crossentropy, axis=-1)

In [None]:
# Build the graph, then wrap it in a trainable model; the noise tensor `eps`
# is declared as a second model input alongside the data input `x`.
x, eps, z_mu, x_pred = vae_arc(original_dim, intermediate_dim, latent_dim)
vae = Model(inputs=[x, eps], outputs=x_pred)
vae.compile(loss=nll, optimizer='adam')

In [None]:
# Print the layer-by-layer summary of the compiled VAE.
vae.summary()

![](https://tiao.io/post/tutorial-on-variational-autoencoders-with-a-concise-keras-implementation/vae_full.svg)

In [None]:
from sklearn.model_selection import train_test_split

# The autoencoder reconstructs its own input, so the scaled features are passed
# as both X and y; 40% of the rows are held out for validation.
X_train, X_test, y_train, y_test = train_test_split(
    df_norm, df_norm, test_size=0.4, random_state=42
)

In [None]:
# Write the model to disk only when the validation loss reaches a new minimum.
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# train
hist = vae.fit(X_train, X_train,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks_list,
        validation_data=(X_test, X_test))

In [None]:
def plt_hist(hist):
    """Plot the training and validation loss curves stored in ``hist.history``.

    Args:
        hist: object whose ``history`` mapping has 'loss' and 'val_loss' series
            (here, the value returned by ``vae.fit``).
    """
    # Draw both curves in a fixed order so they line up with the legend entries.
    for series_name in ('loss', 'val_loss'):
        plt.plot(hist.history[series_name])
    plt.title('model loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train', 'test'], loc='upper left')

In [None]:
# Plot the train/validation loss curves recorded while fitting the VAE.
plt_hist(hist)

In [None]:
# Predict Embedding values
encoder = Model(x, z_mu)
z_df    = encoder.predict(df_norm, batch_size=batch_size)

In [None]:
# Wrap the latent means in a frame and tag every row with label 1.
df_vae = pd.DataFrame(z_df).assign(label=1)


In [None]:
# Give the latent columns the original feature names so the frame can be
# concatenated with the scaled originals. df_vae always carries 'label' as its
# LAST column, whereas column_list keeps 'label' at its position in the raw
# CSV; building the names from the feature frame keeps the target aligned
# regardless of the CSV column order.
# NOTE(review): this still requires latent_dim == number of feature columns.
df_vae.columns = list(df_values.columns) + ['label']
df_vae.head(10)

In [None]:
# Scale the feature columns of the full frame.
# NOTE(review): this uses `df` (all labels), not `df_nnorm`, and refits the
# same `scaler` object that was fitted on the label == 1 rows — confirm intended.
df_nvalues = df.drop(['bid', 'label'], axis=1)
scaled_nnorm = scaler.fit_transform(df_nvalues)
df_scaler_nnorm = pd.DataFrame(
    data=scaled_nnorm,
    columns=df_nvalues.columns,
    index=df_nvalues.index,
)


In [None]:
# Re-attach the target column (aligned on the shared row index).
df_scaler_nnorm['label'] = df['label']

In [None]:
# Stack the VAE embeddings on top of the scaled original rows with a fresh index.
df_train = pd.concat((df_vae, df_scaler_nnorm), axis=0, ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the combined training frame.
df_train.shape

In [None]:
# Target vector and feature matrix for the classifiers.
y = df_train['label']
X = df_train.drop('label', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split for the classifiers (rebinds the X_/y_ names used for the VAE split above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef, confusion_matrix,precision_recall_curve,auc,f1_score,roc_auc_score,roc_curve,recall_score,classification_report 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold and
# cross_val_score are provided by sklearn.model_selection.
from sklearn.model_selection import KFold, cross_val_score


In [None]:

def model(algorithm,dtrain_x,dtrain_y,dtest_x,dtest_y):
    """Fit ``algorithm``, print test-set metrics and plot confusion matrix + ROC curve.

    Parameters
    ----------
    algorithm : estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function`` (used to score the ROC curve).
    dtrain_x, dtrain_y : training features and labels passed to ``fit``.
    dtest_x, dtest_y : held-out features and labels used for every metric and plot.

    Returns
    -------
    None. Metrics are printed and the plots are drawn on a new matplotlib figure.
    """
    print ("MODEL - OUTPUT")
    print ("*****************************************************************************************")
    algorithm.fit(dtrain_x,dtrain_y)
    predictions = algorithm.predict(dtest_x)

    print (algorithm)
    print ("\naccuracy_score :",accuracy_score(dtest_y,predictions))
    print ("\nrecall score:\n",(recall_score(dtest_y,predictions)))
    print ("\nf1 score:\n",(f1_score(dtest_y,predictions)))
    print ("\nmatthews_corrcoef:\n", (matthews_corrcoef(dtest_y, predictions)))

    # Graph: confusion matrix on the left, ROC curve on the right.
    plt.figure(figsize=(13,10))
    plt.subplot(221)
    sns.heatmap(confusion_matrix(dtest_y,predictions),annot=True,fmt = "d",linecolor="k",linewidths=3)
    plt.title("CONFUSION MATRIX",fontsize=20)

    # Scores for the ROC curve: the second predict_proba column when available,
    # otherwise the raw decision values (roc_curve accepts either).
    if hasattr(algorithm, "predict_proba"):
        roc_scores = algorithm.predict_proba(dtest_x)[:,1]
    else:
        roc_scores = algorithm.decision_function(dtest_x)
    fpr,tpr,thresholds = roc_curve(dtest_y,roc_scores)
    plt.subplot(222)
    # Format the AUC into one string so the legend shows readable text
    # instead of the repr of a (text, value) tuple.
    roc_label = "Area_under the curve : {:.3f}".format(auc(fpr,tpr))
    plt.plot(fpr,tpr,label = roc_label,color = "r")
    plt.plot([1,0],[1,0],linestyle = "dashed",color ="k")
    plt.legend(loc = "best")
    plt.title("ROC - CURVE & AREA UNDER CURVE",fontsize=20)


In [None]:

# Evaluate a decision tree with default hyper-parameters.
clf = DecisionTreeClassifier()
model(
    algorithm=clf,
    dtrain_x=X_train,
    dtrain_y=y_train,
    dtest_x=X_test,
    dtest_y=y_test,
)

In [None]:
# Evaluate an SVC; probability=True enables predict_proba, which model() uses for the ROC curve.
svc = SVC(probability=True)
model(
    algorithm=svc,
    dtrain_x=X_train,
    dtrain_y=y_train,
    dtest_x=X_test,
    dtest_y=y_test,
)