### Autoencoder model on CTG dataset
- Load Data
- Preprocess Data
- Perform PCA on Data
- Define Autoencoder Model
- Train Model
- Generate predictions for Train and Test Sets
- Perform PCA on Encoded Data
- Evaluate generated data on classification models

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data
ctg_data = pd.read_csv('../CTGRawData.csv', sep=';')

In [2]:
# Preprocess Data
# Separate Features from labels (last column)
y = ctg_data['NSP']
X = ctg_data.drop('NSP',axis=1).iloc[:,:21]
from sklearn.model_selection import train_test_split
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [3]:
# Normalization of data sets
# Data Scaling MinMax
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_norm = X_train
X_test_norm = X_test

X_train_norm = pd.DataFrame(scaler.fit_transform(X_train_norm))
X_test_norm = pd.DataFrame(scaler.transform(X_test_norm))


In [4]:
# Check Shape of data
print(X_train_norm.head())
print(X_train_norm.shape)
print(np.array(X_train_norm)[:5])
# 21 features that we are going to try to reduce.

         0         1         2         3         4         5         6   \
0  0.075472  0.115385  0.000000  0.391304  0.689189  0.088235  0.000000   
1  0.754717  0.000000  0.030142  0.000000  0.810811  0.014706  0.450549   
2  0.320755  0.000000  0.000000  0.000000  0.391892  0.102941  0.252747   
3  0.679245  0.000000  0.000000  0.260870  0.621622  0.029412  0.241758   
4  0.320755  0.000000  0.000000  0.043478  0.648649  0.058824  0.000000   

         7       8    9   ...        11        12        13        14   15  \
0  0.260766  0.0625  0.0  ...  0.344633  0.388889  0.293103  0.166667  0.0   
1  0.165072  0.0000  0.0  ...  0.180791  0.648148  0.284483  0.222222  0.0   
2  0.253589  0.0000  0.0  ...  0.451977  0.083333  0.172414  0.222222  0.0   
3  0.150718  0.0000  0.0  ...  0.056497  0.879630  0.310345  0.000000  0.0   
4  0.248804  0.0000  0.0  ...  0.276836  0.314815  0.120690  0.222222  0.0   

         16        17        18        19   20  
0  0.409449  0.366972  0.330275

In [6]:
import tensorflow as tf
import matplotlib.pyplot as plt
from typing import List

def loss(x: np.ndarray, x_bar: np.ndarray) -> float:
    return tf.losses.mean_squared_error(x, x_bar)

def grad(model: keras.models.Model, inputs: np.ndarray):
    with tf.GradientTape() as tape:
        reconstruction, inputs_reshaped = model(inputs)
        loss_value = loss(inputs_reshaped, reconstruction)
    return loss_value, tape.gradient(loss_value, model.trainable_variables), inputs_reshaped, reconstruction

def train_model(model: keras.models.Model, inputs: pd.DataFrame):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    x_train = np.array(inputs)
    num_epochs = 30
    batch_size = 400

    for epoch in range(num_epochs):
        print("Epoch: ", epoch)
        for x in range(0, len(x_train), batch_size):
            x_inp = x_train[x : x + batch_size]
            loss_value, grads, inputs_reshaped, reconstruction = grad(model, x_inp)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        print("Step: {},Loss: {}".format(epoch, sum(loss(inputs_reshaped, reconstruction)/batch_size).numpy()))

In [7]:
# Define Autoencoder
class ComposeAutoEncoder(keras.models.Model):
    def __init__(self):
        super(ComposeAutoEncoder, self).__init__()
        
        # this is the size of our encoded representations
        encoding_dim = 16  # 21/16 - we will try to reduce the dimensions to this number.
        # this is our input placeholder
        input_features = layers.Input(shape=(21))

        self.flatten_input = layers.Flatten()(input_features)
        # "Encoder layers"
        self.encoder1 = layers.Dense(64, activation='relu')(self.flatten_input)
        self.encoder2 = layers.Dense(32, activation='relu')(self.encoder1)
        # "Encoder output" - "bottleneck"
        self.bottleneck = layers.Dense(encoding_dim, activation='relu')(self.encoder2)
        # this model maps an input to its encoded representation
        self.encoder = keras.models.Model(input_features, self.bottleneck, name='encoder')

        # "Decoder layers"
        # create a placeholder for an encoded (32-dimensional) input
        self.encoded_input = layers.Input(shape=(encoding_dim,))
        self.decoder1 = layers.Dense(32, activation='relu')(self.encoded_input)
        self.decoder2 = layers.Dense(64, activation='relu')(self.decoder1)
        # "decoded" is the lossy reconstruction of the input
        self.decoded = layers.Dense(21, activation='sigmoid')(self.decoder1)
        # create the decoder model
        self.decoder = keras.models.Model(self.encoded_input, self.decoded, name='decoder')

    
    def call(self, inp):
        # full autoencoder for training
        encoder_model = self.encoder(inp)
        decoder_model = self.decoder(encoder_model)

        return decoder_model, inp

In [8]:
# Instantiate and train model
autoencoder = ComposeAutoEncoder()
train_model(autoencoder, X_train_norm)

Epoch:  0


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Step: 0,Loss: 0.03126535937190056
Epoch:  1
Step: 1,Loss: 0.030112095177173615
Epoch:  2
Step: 2,Loss: 0.028978293761610985
Epoch:  3
Step: 3,Loss: 0.027355892583727837
Epoch:  4
Step: 4,Loss: 0.02521403133869171
Epoch:  5
Step: 5,Loss: 0.022560540586709976
Epoch:  6
Step: 6,Loss: 0.01949266530573368
Epoch:  7
Step: 7,Loss: 0.016293494030833244
Epoch:  8
Step: 8,Loss: 0.013314832001924515
Epoch:  9
Step: 9,Loss: 0.010895207524299622
Epoch:  10
Step: 10,Loss: 0.009188049472868443
Epoch:  11
Step: 11,Loss: 0.008129659108817577
Epoch:  12
Step: 12,Loss: 0.007542688399553299
Epoch:  13
Step: 13,Loss: 0.0071846977807581425
Epoch:  14
Step: 14,Loss: 0.006950449664145708
Epoch:  15
Step:

In [9]:
predictions = autoencoder.predict(np.array(X_test_norm))



In [15]:
predictions_dataframe = pd.DataFrame(predictions[0])
original_dataframe = pd.DataFrame(predictions[1])
print(predictions_dataframe.iloc[:,:5].describe())
print(original_dataframe.iloc[:,:5].describe())

                0           1           2           3           4
count  426.000000  426.000000  426.000000  426.000000  426.000000
mean     0.532436    0.082475    0.007753    0.146734    0.474328
std      0.057266    0.030605    0.006593    0.029322    0.147643
min      0.398241    0.028837    0.000688    0.087296    0.221656
25%      0.493857    0.060143    0.003193    0.125589    0.355994
50%      0.532846    0.075703    0.005323    0.143432    0.456768
75%      0.574651    0.102335    0.010678    0.164908    0.580504
max      0.646960    0.186834    0.041390    0.248167    0.781621
                0           1           2           3           4
count  426.000000  426.000000  426.000000  426.000000  426.000000
mean     0.530251    0.103467    0.010480    0.148908    0.478810
std      0.184262    0.134833    0.046596    0.120898    0.231382
min      0.075472    0.000000    0.000000    0.000000    0.000000
25%      0.396226    0.000000    0.000000    0.043478    0.270270
50%      0