### Autoencoder model on CTG dataset
- Load Data
- Preprocess Data
- Define Autoencoder Model
- Train Model
- Generate predictions for Train and Test Sets

- Evaluate generated data on classification models

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data
ctg_data = pd.read_csv('../CTGRawData.csv', sep=';')

In [2]:
# Preprocess Data
# Separate Features from labels (last column)
y = ctg_data['NSP']
X = ctg_data.drop('NSP',axis=1).iloc[:,:21]
from sklearn.model_selection import train_test_split
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [3]:
# Normalization of data sets
# Data Scaling MinMax
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_norm = X_train
X_test_norm = X_test

X_train_norm = pd.DataFrame(scaler.fit_transform(X_train_norm))
X_test_norm = pd.DataFrame(scaler.transform(X_test_norm))


In [17]:
# Check Shape of data
print(X_train_norm.head())
print(X_train_norm.shape)
print(np.array(X_train_norm)[:5])
# 21 features that we are going to try to reduce.

         0         1         2         3         4         5         6   \
0  0.592593  0.000000  0.000000  0.086957  0.360000  0.088235  0.109890   
1  0.685185  0.000000  0.000000  0.130435  0.733333  0.014706  0.065934   
2  0.759259  0.192308  0.000000  0.173913  0.320000  0.147059  0.000000   
3  0.851852  0.000000  0.000000  0.086957  0.600000  0.044118  0.472527   
4  0.407407  0.038462  0.003546  0.043478  0.026667  0.367647  0.000000   

         7      8    9   ...        11        12        13      14   15  \
0  0.179487  0.000  0.0  ...  0.112994  0.743119  0.275862  0.1250  0.0   
1  0.189349  0.000  0.0  ...  0.118644  0.770642  0.310345  0.0625  0.0   
2  0.094675  0.000  0.0  ...  0.220339  0.752294  0.448276  0.1250  0.0   
3  0.171598  0.000  0.0  ...  0.214689  0.669725  0.362069  0.2500  0.0   
4  0.000000  0.125  0.0  ...  0.485876  0.155963  0.293103  0.2500  0.0   

         16        17        18        19   20  
0  0.650794  0.623853  0.605505  0.003717  0.5  


In [41]:
import tensorflow as tf
import matplotlib.pyplot as plt
from typing import List

def loss(x: np.ndarray, x_bar: np.ndarray) -> float:
    return tf.losses.mean_squared_error(x, x_bar)

def grad(model: keras.models.Model, inputs: np.ndarray):
    with tf.GradientTape() as tape:
        reconstruction, inputs_reshaped = model(inputs)
        loss_value = loss(inputs_reshaped, reconstruction)
    return loss_value, tape.gradient(loss_value, model.trainable_variables), inputs_reshaped, reconstruction

def train_model(model: keras.models.Model, inputs: pd.DataFrame):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    x_train = np.array(inputs)
    num_epochs = 30
    batch_size = 400

    for epoch in range(num_epochs):
        print("Epoch: ", epoch)
        for x in range(0, len(x_train), batch_size):
            x_inp = x_train[x : x + batch_size]
            loss_value, grads, inputs_reshaped, reconstruction = grad(model, x_inp)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        print("Step: {},Loss: {}".format(epoch, sum(loss(inputs_reshaped, reconstruction)/batch_size).numpy()))

In [14]:
# Define Autoencoder
class ComposeAutoEncoder(keras.models.Model):
    def __init__(self):
        super(ComposeAutoEncoder, self).__init__()
        
        # this is the size of our encoded representations
        encoding_dim = 16  # 21/16 - we will try to reduce the dimensions to this number.
        # this is our input placeholder
        input_features = layers.Input(shape=(21))

        self.flatten_input = layers.Flatten()(input_features)
        # "Encoder layers"
        self.encoder1 = layers.Dense(64, activation='relu')(self.flatten_input)
        self.encoder2 = layers.Dense(32, activation='relu')(self.encoder1)
        # "Encoder output" - "bottleneck"
        self.bottleneck = layers.Dense(encoding_dim, activation='relu')(self.encoder2)
        # this model maps an input to its encoded representation
        self.encoder = keras.models.Model(input_features, self.bottleneck, name='encoder')

        # "Decoder layers"
        # create a placeholder for an encoded (32-dimensional) input
        self.encoded_input = layers.Input(shape=(encoding_dim,))
        self.decoder1 = layers.Dense(32, activation='relu')(self.encoded_input)
        self.decoder2 = layers.Dense(64, activation='relu')(self.decoder1)
        # "decoded" is the lossy reconstruction of the input
        self.decoded = layers.Dense(21, activation='sigmoid')(self.decoder1)
        # create the decoder model
        self.decoder = keras.models.Model(self.encoded_input, self.decoded, name='decoder')

    
    def call(self, inp):
        # full autoencoder for training
        encoder_model = self.encoder(inp)
        decoder_model = self.decoder(encoder_model)

        return decoder_model, inp

In [42]:
# Instantiate and train model
autoencoder = ComposeAutoEncoder()
train_model(autoencoder, X_train_norm)

Epoch:  0


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Step: 0,Loss: 0.03069094754755497
Epoch:  1
Step: 1,Loss: 0.02953142300248146
Epoch:  2
Step: 2,Loss: 0.027845604345202446
Epoch:  3
Step: 3,Loss: 0.025549378246068954
Epoch:  4
Step: 4,Loss: 0.022586962208151817
Epoch:  5
Step: 5,Loss: 0.019061794504523277
Epoch:  6
Step: 6,Loss: 0.01549462042748928
Epoch:  7
Step: 7,Loss: 0.0125578036531806
Epoch:  8
Step: 8,Loss: 0.01051331590861082
Epoch:  9
Step: 9,Loss: 0.009121145121753216
Epoch:  10
Step: 10,Loss: 0.008163249120116234
Epoch:  11
Step: 11,Loss: 0.007515279110521078
Epoch:  12
Step: 12,Loss: 0.006956912111490965
Epoch:  13
Step: 13,Loss: 0.006487374193966389
Epoch:  14
Step: 14,Loss: 0.006091434508562088
Epoch:  15
Step: 15,

In [43]:
predictions = autoencoder.predict(np.array(X_test_norm))



In [44]:
predictions[0][:1]
model_predictions = predictions[0]
original = predictions[1]

In [40]:
np.sum(np.square(model_predictions[0] - original[0]))/21

0.08069318816775367

In [45]:
loss(predictions[0],predictions[1])

<tf.Tensor: id=40070, shape=(426,), dtype=float32, numpy=
array([0.01202125, 0.00751258, 0.01543015, 0.00894597, 0.00999405,
       0.05749016, 0.00385193, 0.00458007, 0.01495536, 0.00742398,
       0.01497885, 0.00765213, 0.01546528, 0.03273059, 0.01949633,
       0.01722872, 0.01925274, 0.0111112 , 0.00883612, 0.01063619,
       0.01277492, 0.03945161, 0.00928657, 0.01130201, 0.01307223,
       0.03189211, 0.0083391 , 0.02406597, 0.01008516, 0.01624949,
       0.01727198, 0.01125048, 0.01247197, 0.0170657 , 0.00824525,
       0.00600527, 0.00930242, 0.0048705 , 0.01265698, 0.00996941,
       0.01366588, 0.00267243, 0.00808315, 0.00613591, 0.01316005,
       0.0136971 , 0.01684471, 0.02621414, 0.01796221, 0.04345824,
       0.00853656, 0.00629912, 0.03311878, 0.01021268, 0.03337889,
       0.01716169, 0.00940685, 0.01598976, 0.0111738 , 0.01070078,
       0.03457233, 0.02495486, 0.01119348, 0.01938914, 0.01655806,
       0.01310706, 0.00897702, 0.00962229, 0.01397998, 0.01762207,
    

In [46]:
original[0]

array([0.46296296, 0.        , 0.        , 0.17391305, 0.22666667,
       0.11764706, 0.        , 0.25641027, 0.        , 0.        ,
       0.        , 0.15819208, 0.6146789 , 0.22413793, 0.        ,
       0.        , 0.54761904, 0.5504587 , 0.5229358 , 0.01858736,
       0.5       ], dtype=float32)

In [47]:
model_predictions[0]

array([0.5493144 , 0.10448322, 0.0215407 , 0.15353984, 0.52574575,
       0.10519052, 0.16590884, 0.18509427, 0.09292799, 0.01884124,
       0.03267112, 0.25151324, 0.5873399 , 0.31799772, 0.21157381,
       0.03207272, 0.62124264, 0.6162955 , 0.6006619 , 0.05946442,
       0.6392676 ], dtype=float32)