# VARIATIONAL AUTOENCODER MODEL (VAE)
This model applies the Feature Selection (FS) technique "Recursive Feature Elimination (RFE)" and the class-imbalance algorithm "Synthetic Minority Oversampling Technique (SMOTE)" to further reduce data complexity and improve computational efficiency.

INSTALL MODULES

In [None]:
# Supports nested progress bars
!pip install keras_tqdm



IMPORT LIBRARIES

In [None]:
import os # Standard library within Python 3
import numpy as np # Lib for multi-dimensional arrays and matrices handling 
import pandas as pd # Lib for data manipulation and analysis

%matplotlib inline
import matplotlib.pyplot as plt # Lib for interactive plots
plt.style.use('seaborn-white') # Sets theme of visualization (seaborn-ticks / whitegrid) are similar to white
import seaborn as sns # Matplotlib based lib - better interface for drawing attractive and informative statistical graphics
sns.set_palette(['#FC4B60','#06B1F0'])
random_seed = 63445 

import warnings # Lib for warning issue handling
warnings.filterwarnings('ignore') # Ignores all irrelevant warnings
from collections import Counter

from sklearn import svm # SVM model for RFE Feature Selection
from sklearn.preprocessing import MinMaxScaler # Feature scaling
from sklearn.model_selection import train_test_split # Splitting data into training and test set
from imblearn.over_sampling import SMOTE # Oversample data using SMOTE algorithm

# Framework / Platform for building ML models
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Lambda, Layer, Activation
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
# tf.disable_v2_behavior()

from keras_tqdm import TQDMNotebookCallback # TQDM progress bar

print("Keras Version:",keras.__version__)
print("TensorFlow Version:",tf.__version__)

Keras Version: 2.4.0
TensorFlow Version: 2.3.0


NOTE : TensorFlow's eager execution is an imperative programming environment that evaluates operations immediately, without building graphs: operations return concrete values instead of constructing a computational graph to run later.

In [None]:
# Disabling eager execution
tf.compat.v1.disable_eager_execution()

EAGER EXECUTION CHECK

In [None]:
tf.executing_eagerly() # True shows that eager execution is enabled

False

SEABORN STYLE DESCRIPTION

In [None]:
sns.set(style="white", color_codes=True)
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":15,"axes.labelsize":20,
                             'xtick.labelsize':14, 'ytick.labelsize':14})

VAE MODEL FUNCTION

In [None]:
# Function for reparameterization trick to make model differentiable
def sampling(args):
    """
    Reparameterization trick used by the Keras Lambda layer.

    `args` is the pair (z_mean, z_log_var). Noise is drawn from a normal
    distribution with standard deviation `epsilon_std` (module-level) and
    the same shape as z_mean, then scaled and shifted so that the sample
    stays differentiable with respect to both inputs.
    """
    mu, log_variance = args

    # Random part of the latent vector
    noise = K.random_normal(shape=tf.shape(mu), mean=0.,
                            stddev=epsilon_std)

    # exp(log_var / 2) is the standard deviation
    sigma = K.exp(log_variance / 2)
    return mu + sigma * noise


class CustomVariationalLayer(Layer):
    """
    Pass-through layer that attaches the VAE training objective to the model.

    The layer receives ``[x, x_decoded]``, registers the combined
    reconstruction + KL loss through ``add_loss`` and returns ``x`` unchanged.
    It reads module-level names created in other cells: ``original_dim``,
    ``z_mean_encoded``, ``z_log_var_encoded`` and the Keras variable ``beta``.

    Adapted from:
    https://github.com/fchollet/keras/blob/master/examples/variational_autoencoder.py
    """
    def __init__(self, **kwargs):
        # https://keras.io/layers/writing-your-own-keras-layers/
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x_input, x_decoded):
        """Return the batch mean of reconstruction loss + beta * KL divergence."""
        reconstruction_loss = original_dim * metrics.binary_crossentropy(x_input, x_decoded)
        kl_loss = - 0.5 * K.sum(1 + z_log_var_encoded - K.square(z_mean_encoded) -
                                K.exp(z_log_var_encoded), axis=-1)
        # Multiply by the beta variable itself. K.get_value(beta) would return
        # the plain number beta holds while the loss is being built (0), which
        # freezes the KL weight and hides every later K.set_value update made
        # by WarmUpCallback.
        return K.mean(reconstruction_loss + (beta * kl_loss))

    def call(self, inputs):
        """Register the VAE loss for ``inputs = [x, x_decoded]`` and return ``x``."""
        x = inputs[0]
        x_decoded = inputs[1]
        loss = self.vae_loss(x, x_decoded)
        self.add_loss(loss, inputs=inputs)
        # We won't actually use the output.
        return x

class WarmUpCallback(Callback):
    """
    Linearly warm up the KL weight during training.

    At the end of every epoch ``beta`` (a Keras backend variable) is raised by
    ``kappa`` until it reaches 1, where it is held.

    Arguments
    ---------
    beta:  Keras backend variable holding the current KL weight
    kappa: amount added to ``beta`` after each epoch
    """
    def __init__(self, beta, kappa):
        # Let the Keras base class set up its own attributes first
        super(WarmUpCallback, self).__init__()
        self.beta = beta
        self.kappa = kappa

    # Behavior on each epoch
    def on_epoch_end(self, epoch, logs=None):
        current = K.get_value(self.beta)
        if current < 1:
            raised = current + self.kappa
            # Cap at 1 so the KL term is never weighted above the
            # reconstruction term (an unconditional add overshoots, e.g.
            # 0 -> 1 -> 2 with kappa = 1)
            K.set_value(self.beta, raised if raised < 1 else 1.0)


MOUNT DRIVE & SET DATA PATH

In [None]:
from google.colab import drive # Link notebook with google drive
drive.mount('/content/gdrive/') # To retrieve data from our personal Gdrive (can remove this line if we are about to access data from pc)
#Define path
data_path = '/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/' #Change this path accordingly

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


CHECKING FOR MACHINE RUNTIME TYPE

In [None]:
# List the compute devices (CPU / GPU) that TensorFlow can see on this machine
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
# NOTE(review): this only binds a Python variable named CUDA_VISIBLE_DEVICES.
# Nothing in this notebook reads it, and it does not set the environment
# variable of the same name - confirm whether it can be removed.
CUDA_VISIBLE_DEVICES = 1

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8453986134513322460
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 16341981410105669302
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 8504566838642120967
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11146722048
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8085203975290687077
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"
]


IMPORT DATASET

In [None]:
# Load the multi-omics table; the first CSV column becomes the row index
print("Reading Multi-omics Data")
df = pd.read_csv(data_path + "Complete_MultiOmics.csv", delimiter=",", index_col=0)
# Counts every column except the first and the last one.
# NOTE(review): because index_col=0 already moved the first CSV column into the
# index, the first remaining column looks like a real feature rather than an
# ID - confirm whether "- 2" should be "- 1".
nFeatures = len(df.columns) - 2
print("Number of features :", nFeatures)
print("Size of Dataset :", df.shape)

Reading Multi-omics Data
Number of features : 18663
Size of Dataset : (344, 18665)


DISPLAY MULTI-OMICS DATA

In [None]:
print("Multi-Omics data imported successfully")
df

Multi-Omics data imported successfully


Unnamed: 0_level_0,?|10357,?|10431,?|155060,?|57714,?|653553,?|8225,A1BG|1,A2LD1|87769,A2M|2,A4GALT|53947,AAAS|8086,AACS|65985,AADAT|51166,AAGAB|79719,AAK1|22848,AAMP|14,AARS2|57505,AARSD1|80755,AARS|16,AASDHPPT|60496,AASDH|132949,AASS|10157,AATF|26574,AATK|9625,ABAT|18,ABCA11P|79963,ABCA1|19,ABCA2|20,ABCA3|21,ABCA5|23461,ABCA6|23460,ABCA7|10347,ABCA9|10350,ABCB10|23456,ABCB1|5243,ABCB6|10058,ABCB7|22,ABCB8|11194,ABCB9|23457,ABCC10|89845,...,rs2208123,rs2235751,rs2385226,rs2468330,rs2521373,rs264581,rs2804694,rs2857639,rs2959823,rs348937,rs3818562,rs3936238,rs4331560,rs472920,rs4742386,rs5926356,rs5931272,rs5936512,rs5987737,rs6426327,rs6471533,rs654498,rs6546473,rs6626309,rs6982811,rs6991394,rs715359,rs739259,rs7660805,rs7746156,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,Class
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
TCGA.18.5592.01,236.8295,1141.0830,88.0285,747.6943,409.8256,498.2193,12.0537,111.0547,2676.6396,2246.0049,833.5312,1965.1173,246.9181,1057.0724,1256.8715,3962.7431,569.7891,693.2700,3203.7257,930.3260,163.6380,164.3686,1859.9215,64.2864,21.9158,51.7432,1202.8089,1341.2474,62.8253,487.2614,36.1611,341.5213,14.9758,1291.2063,26.6642,2960.3360,550.8173,702.4016,191.1789,627.8879,...,0.445828,0.498636,0.973635,0.974207,0.928097,0.460344,0.952743,0.609471,0.906139,0.810115,0.163124,0.154739,0.022984,0.544173,0.832685,0.098743,0.976876,0.943812,0.952602,0.020949,0.519868,0.841502,0.016631,0.923785,0.179910,0.497850,0.419557,0.465892,0.850265,0.461360,0.012726,0.481794,0.239229,0.087837,0.536541,0.022941,0.540579,0.026454,0.045728,1
TCGA.18.5595.01,245.4566,999.4328,68.3494,671.5825,821.0437,423.4260,26.6591,45.0227,7857.6064,193.7039,961.1458,2001.1344,72.8871,889.1095,896.4833,2716.3925,537.1384,513.0459,6287.2944,502.8361,279.3534,346.5683,1592.4560,33.1849,188.3154,101.7754,640.6693,1015.8820,1530.9132,307.7141,14.4640,295.2354,7.6574,442.9949,4.8213,3476.2365,483.5508,228.5876,150.0851,374.3619,...,0.852261,0.516386,0.334989,0.473545,0.055859,0.954779,0.447732,0.615397,0.859717,0.455601,0.959690,0.399521,0.430007,0.691078,0.528738,0.889545,0.015282,0.925424,0.049640,0.037420,0.953146,0.358960,0.017422,0.911405,0.379057,0.500063,0.976334,0.552217,0.036871,0.546697,0.013850,0.396258,0.956542,0.482447,0.566233,0.966587,0.532004,0.510593,0.933625,1
TCGA.21.5782.01,308.5506,923.5955,146.0674,212.3596,1083.1461,767.4157,155.0562,198.5281,9403.2921,1151.6854,1142.6966,1384.2697,39.3258,1355.0562,1052.8090,3691.0112,988.5169,613.4831,7869.6629,719.1011,321.3483,278.6517,2089.8876,48.3146,86.5169,117.4045,1119.1011,861.7978,574.1573,234.8315,51.6854,525.8427,32.5843,277.5281,17.9775,2008.1910,302.2472,831.4607,292.9326,1108.9888,...,0.452553,0.036205,0.967153,0.471226,0.032291,0.438903,0.574025,0.963228,0.610111,0.402136,0.929609,0.030298,0.948934,0.549283,0.696018,0.469880,0.443723,0.696970,0.491719,0.042476,0.956615,0.916568,0.017187,0.513279,0.681604,0.329247,0.511011,0.524875,0.046657,0.028637,0.967372,0.933384,0.544448,0.298030,0.932454,0.030718,0.972964,0.466552,0.930599,1
TCGA.21.5783.01,192.7190,526.9117,150.0900,906.6365,748.4178,607.0371,144.2199,47.5353,5312.0566,340.8233,969.9239,1179.2371,190.1527,1441.6768,1964.2339,2394.7628,663.5749,453.7537,3939.4995,681.6466,291.4707,593.3926,903.1528,115.8335,306.2765,15.7551,1544.1125,1215.5258,372.7574,236.3119,28.7406,138.7679,10.7415,265.9235,21.4829,640.6666,548.9752,1006.2126,238.8841,418.3359,...,0.867139,0.068455,0.029902,0.383294,0.934068,0.032984,0.968455,0.069826,0.936634,0.238830,0.758543,0.971294,0.025164,0.500824,0.960496,0.115242,0.019951,0.039448,0.034378,0.875392,0.510459,0.920425,0.024096,0.064971,0.139338,0.068760,0.971782,0.033474,0.629115,0.032588,0.967869,0.583177,0.020746,0.357359,0.449217,0.963890,0.455323,0.529334,0.936669,1
TCGA.21.5784.01,161.6057,1196.5056,51.7527,212.7613,3455.4904,557.7795,144.1336,106.4647,20321.8932,1511.4453,470.6403,1242.9503,26.9822,1051.8633,543.6249,3648.3468,355.1697,462.2360,2652.2172,872.7192,205.2416,335.7293,1096.5388,65.9073,168.9705,100.3118,770.5407,426.8495,3009.1784,260.9753,60.5994,516.6427,23.0012,389.2514,37.5981,1725.8653,259.6483,669.2469,101.2098,877.1425,...,0.878958,0.046282,0.415254,0.025814,0.426774,0.029040,0.388852,0.038840,0.936436,0.524348,0.462386,0.443197,0.459078,0.966929,0.956077,0.942897,0.013271,0.489159,0.027884,0.024321,0.521091,0.088205,0.462810,0.449937,0.029276,0.051736,0.503357,0.441174,0.956625,0.019335,0.393124,0.503706,0.018432,0.639100,0.030030,0.590748,0.522493,0.031209,0.955941,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.O2.A52S.01,176.7177,1188.3278,226.8212,1100.5795,172.1854,452.8146,133.1664,75.6954,3629.9421,382.8642,1100.1656,707.7815,125.8278,826.9868,1319.9503,2686.6722,516.5025,791.3907,10120.8609,1137.0033,240.0662,200.7450,3512.0033,258.2781,50.9106,132.8146,2937.8932,1688.7417,482.6159,334.4371,26.0762,227.2351,20.6954,658.5265,34.3543,2513.3320,409.3543,670.9437,146.8212,419.2881,...,0.880683,0.045593,0.030224,0.963950,0.924719,0.041458,0.950275,0.954434,0.928485,0.301883,0.958972,0.026601,0.464005,0.038441,0.060987,0.112787,0.975272,0.046932,0.025996,0.892462,0.073984,0.275673,0.024864,0.941064,0.058028,0.066683,0.452329,0.040967,0.283380,0.749446,0.024437,0.084381,0.030087,0.952404,0.569576,0.043652,0.964282,0.367595,0.931168,1
TCGA.O2.A52V.01,188.7215,1248.0303,147.4945,510.5578,1253.0728,811.4298,148.8055,86.9041,5406.0132,1188.7803,829.0787,731.5895,61.3510,1471.5831,1637.5670,3304.9690,452.1315,614.3502,2855.3419,795.0415,162.6221,1217.3548,1477.8863,41.1808,476.5206,66.1330,1495.5016,750.9192,222.7125,619.3928,66.3935,630.3183,27.7340,758.4830,11.7659,676.2517,329.4464,902.1956,113.3606,846.3074,...,0.547567,0.032090,0.031295,0.443925,0.922467,0.959101,0.025450,0.624942,0.553258,0.393817,0.953462,0.035437,0.030303,0.051817,0.639068,0.870381,0.401359,0.503059,0.043866,0.944060,0.962645,0.876272,0.511247,0.928551,0.060193,0.937401,0.033606,0.403305,0.049309,0.035960,0.965751,0.067980,0.449615,0.472282,0.068062,0.958774,0.514518,0.431195,0.107549,1
TCGA.O2.A52W.01,260.3332,789.3606,854.3794,388.5008,2354.1107,985.4917,52.7243,81.4992,5299.1080,5458.3557,751.2090,1027.9420,49.4358,890.9189,1089.1994,3845.7818,997.2703,537.8829,3266.5234,675.9807,195.0564,679.2047,1721.6550,65.5562,296.0774,178.2053,1082.1762,219.7743,472.3267,365.9323,13.4336,522.8372,14.5083,396.0236,20.4191,304.2826,645.8893,565.2875,162.3536,1877.4852,...,0.884849,0.946784,0.021832,0.968483,0.936312,0.460789,0.491155,0.547796,0.064783,0.360556,0.410325,0.023759,0.020351,0.621506,0.350479,0.118179,0.016919,0.043427,0.957663,0.029629,0.454733,0.573819,0.015640,0.071451,0.966129,0.961666,0.560211,0.024349,0.035562,0.970618,0.971247,0.265095,0.573803,0.506155,0.546692,0.719723,0.530074,0.460235,0.931966,0
TCGA.O2.A5IB.01,160.1624,460.8626,569.2226,932.1086,201.2780,985.0905,14.9281,43.5304,16530.5698,26.8903,706.3365,1427.0501,456.0703,789.4036,835.7295,1313.0990,1310.7481,593.7167,5865.0160,934.5048,392.1725,365.8147,1921.9915,7.9872,105.1651,110.9159,228.1683,337.8594,501.5974,430.5112,4.5261,1315.4952,7.4547,431.8424,16.2407,497.7982,475.2396,688.7646,156.4297,758.7859,...,0.860961,0.024698,0.977430,0.969208,0.423610,0.446033,0.446225,0.960414,0.914394,0.079533,0.468437,0.349740,0.965156,0.036333,0.510305,0.491987,0.646609,0.962580,0.028517,0.029790,0.961354,0.488548,0.018625,0.967592,0.036266,0.529104,0.367031,0.024477,0.512597,0.340154,0.013990,0.031028,0.014438,0.019634,0.948490,0.945375,0.034040,0.501818,0.902703,0


# RECURSIVE FEATURE ELIMINATION (RFE)

DATA AGGREGATION (FEATURES & TARGETS)

In [None]:
# df was loaded with index_col=0, so the patient ID is already the row index
# and every column except the trailing 'Class' label is an omics feature.
# Slicing from column 0 keeps the first feature, which a 1:-1 slice discards.
features = df.iloc[:, :-1]  # all rows, every column but the last
target = df.iloc[:, -1]     # all rows, last column only (class label)
nFeatures = features.shape[1]  # keep the feature counter in step with the matrix

print("Features size :", features.shape)
print("Target size :", target.shape)

Features size : (344, 18663)
Target size : (344,)


DATA NORMALIZATION (MIN-MAX SCALING)

In [None]:
# Rescale every feature to the range 0 to 1
# NOTE(review): the scaler is fitted on all samples before the train/test
# split, so test-set statistics influence the scaling - confirm this is acceptable.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
# Re-attach the original row and column labels: fit_transform returns a bare
# array, and wrapping it without labels would renumber patients and features
# 0..n-1, losing the feature names reported by the feature-selection cells.
features = pd.DataFrame(min_max_scaler.fit_transform(features),
                        index=features.index,
                        columns=features.columns)

SVM MODEL FITTING

In [None]:
# Regularisation strength of the SVM
C = 1.0
rfeIndex = nFeatures
# Fit a linear-kernel SVM; its per-feature weights drive the feature elimination
model = svm.SVC(kernel='linear', C=C).fit(features, target)
coef = model.coef_
# Print the weight of every feature. Pairing each column with its weight
# directly keeps the listing correct even if nFeatures is out of step with
# the actual number of columns.
for feature_name, weight in zip(features.columns, coef[0]):
    print(feature_name, ":", weight)

0 : 0.0052621938865834396
1 : 0.004724196502736533
2 : -0.0008045444658995377
3 : 0.0013365444368546196
4 : -0.0012319264457507926
5 : 0.0005289191811819271
6 : -0.0002920457844723238
7 : 0.0008010662479989481
8 : -0.010427997379774304
9 : -0.0017662166766826966
10 : 0.003157122305756302
11 : 0.006164249209782801
12 : 0.00570257718497879
13 : -0.000291065278650697
14 : 0.0016695012682625726
15 : 0.0024871373407416436
16 : 0.0033355335537760145
17 : 0.0007406759865389342
18 : -0.006107020108326404
19 : 0.0011216443877168692
20 : -0.0005399069093937127
21 : 0.002144840767372437
22 : 0.0007521372710282687
23 : 0.0048284158560122055
24 : -0.005093861462074068
25 : 0.0025312782132240275
26 : 0.0005520822454519066
27 : -0.002375210520809403
28 : 0.002247500608309786
29 : 0.002512722280889534
30 : -0.0016704616547441823
31 : 0.0004888987426029127
32 : -0.007713310124508072
33 : 0.0003413458873352524
34 : 0.006514150167761252
35 : 0.004271333210860708
36 : 0.00031360742387614125
37 : 0.0065915

RFE IMPLEMENTATION

In [None]:
# Recursive Feature Elimination: repeatedly discard the features whose SVM
# weight has the smallest magnitude, refitting the SVM on the surviving
# features between rounds so the ranking always matches the current columns.
n_features_to_keep = 12000  # size of the reduced feature set
rfe_step = 500              # features removed per round (1 = one at a time, slowest)

while features.shape[1] > n_features_to_keep:
    surplus = features.shape[1] - n_features_to_keep
    # Plain conditional instead of the builtin min(): a notebook session may
    # still hold a variable called `min` from an older run of this cell
    n_drop = rfe_step if rfe_step < surplus else surplus

    # The sign of a weight only encodes the class direction, so features are
    # ranked by absolute weight
    weights = np.abs(coef[0])
    weakest = np.argsort(weights)[:n_drop]

    print("Lowest feature weight is for", features.columns[weakest[0]],
          "with a value of:", coef[0][weakest[0]])
    print("Dropping", n_drop, "features")

    # Select by position so that duplicated column labels cannot remove
    # more columns than intended
    keep = np.ones(features.shape[1], dtype=bool)
    keep[weakest] = False
    features = features.loc[:, keep]
    rfeIndex = nFeatures = features.shape[1]

    # Refit only if another elimination round follows
    if nFeatures > n_features_to_keep:
        model = svm.SVC(kernel='linear', C=C).fit(features, target)
        coef = model.coef_

FEATURES AFTER RFE

In [None]:
print("Dataset size after Feature Selection:",features.shape)

NORMALIZED FEATURES

In [None]:
features

# SYNTHETIC MINORITY OVERSAMPLING TECHNIQUE (SMOTE)

CLASS IMBALANCE COUNTER

In [None]:
from collections import Counter
counter = Counter(df['Class'])
counter

CLASS IMBALANCE PLOT (VISUALIZATION)

In [None]:
sns.set(style="darkgrid")
# Pass the labels through the keyword `x`: from seaborn 0.12 on, the only
# positional argument countplot accepts is `data`
ax = sns.countplot(x=df['Class'])
ax.set(xlabel='Class', ylabel='Frequency')
plt.show()

DATA SPLITTING

In [None]:
# Hold out 25% of the samples as the test set (fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=42)

# Report the shape of each of the four partitions
for caption, part in (("X TRAIN DATA SHAPE: ", x_train),
                      ("X TEST DATA SHAPE: ", x_test),
                      ("Y TRAIN DATA SHAPE: ", y_train),
                      ("Y TEST DATA SHAPE: ", y_test)):
    print(caption, part.shape)

SMOTE MODEL FITTING

In [None]:
# Borderline-1 SMOTE lives in its own class, and the `ratio` / `kind` arguments
# of SMOTE as well as `fit_sample` have been removed from imbalanced-learn.
# BorderlineSMOTE, `sampling_strategy` and `fit_resample` are the current
# equivalents.
from imblearn.over_sampling import BorderlineSMOTE

# sampling_strategy=1.0: oversample the minority class until it matches the
# majority class in size
sm = BorderlineSMOTE(k_neighbors=1, sampling_strategy=1.0,
                     random_state=random_seed, kind='borderline-1')
# Only the training split is oversampled; the test split keeps its natural balance
x_train, y_train = sm.fit_resample(x_train, y_train)
print ('Shape of oversampled data: {}'.format(x_train.shape))
print ('Shape of Y: {}'.format(y_train.shape))

BALANCED DATA VISUALIZATION

In [None]:
sns.set(style="darkgrid")
# Pass the labels through the keyword `x`: from seaborn 0.12 on, the only
# positional argument countplot accepts is `data`
ax = sns.countplot(x=y_train)
ax.set(xlabel='Class', ylabel='Frequency')
plt.title('Balanced training data')
plt.show()

CLASS (TARGET) AFTER SMOTE

In [None]:
print('Resampled dataset shape for Train {}'.format(Counter(y_train)))
print('Normal validation dataset shape for Test {}'.format(Counter(y_test)))

# VARIATIONAL AUTOENCODER (VAE)

HYPER-PARAMETER SETTING

In [None]:
# Hyper-parameters shared by the VAE cells below
original_dim = features.shape[1]  # number of input features (columns of `features`)
latent_dim = 100  # width of the z_mean / z_log_var / z layers

batch_size = 4 
epochs = 100 
learning_rate =  0.0005  # passed to the Adam optimizer

epsilon_std = 1.0  # standard deviation of the noise drawn in sampling()
beta = K.variable(0)  # KL weight; starts at 0 and is raised by WarmUpCallback
kappa = 1  # amount WarmUpCallback adds to beta at the end of an epoch

VAE ENCODER & HIDDEN LAYER

In [None]:
# Input placeholder for the omics data; one value per selected feature
omics_input = Input(shape=(original_dim, ))

# The input is compressed into a mean vector and a log-variance vector, each of
# size `latent_dim`. Both branches read the omics input tensor directly and run
# the same three steps separately: a glorot-uniform initialised, L2-regularised
# dense layer, then batch normalisation, then a ReLU activation.
# NOTE(review): ReLU makes both z_mean_encoded and z_log_var_encoded
# non-negative - confirm that restricting the latent mean and log-variance
# this way is intended.
z_mean_dense_linear = Dense(latent_dim, kernel_initializer='glorot_uniform', kernel_regularizer = regularizers.l2(0.01))(omics_input)
z_mean_dense_batchnorm = BatchNormalization()(z_mean_dense_linear)
z_mean_encoded = Activation('relu')(z_mean_dense_batchnorm)

z_log_var_dense_linear = Dense(latent_dim, kernel_initializer='glorot_uniform', kernel_regularizer = regularizers.l2(0.01))(omics_input)
z_log_var_dense_batchnorm = BatchNormalization()(z_log_var_dense_linear)
z_log_var_encoded = Activation('relu')(z_log_var_dense_batchnorm)

# Randomly sampled latent vector z: the Lambda layer feeds both encoded
# tensors to sampling() and produces an output of size `latent_dim`
z = Lambda(sampling, output_shape=(latent_dim, ))([z_mean_encoded, z_log_var_encoded])

VAE DECODER LAYER

In [None]:
# The decoding layer is much simpler with a single layer and sigmoid activation
decoder_to_reconstruct = Dense(original_dim, kernel_initializer='glorot_uniform',kernel_regularizer = regularizers.l2(0.01), activation='sigmoid')
omics_reconstruct = decoder_to_reconstruct(z)

VAE MODEL LAYER COMPILATION

In [None]:
# Adam optimizer with the learning rate set in the hyper-parameter cell
# NOTE(review): newer Keras releases name this argument `learning_rate` - confirm
# against the installed version.
adam = optimizers.Adam(lr=learning_rate)
# The custom layer registers the VAE loss via add_loss and passes the input through
vae_layer = CustomVariationalLayer()([omics_input, omics_reconstruct])
vae = Model(omics_input, vae_layer)
# NOTE(review): the model's only output is the pass-through input returned by
# CustomVariationalLayer.call, so this compiled binary_crossentropy compares
# that input with the target given to fit(); it is separate from the loss the
# layer adds itself. The upstream Keras example compiles with loss=None -
# confirm whether this extra, beta-weighted term is wanted.
vae.compile(optimizer = adam, loss = 'binary_crossentropy', loss_weights = [beta])
vae.summary()

TRAINING DATA FORMATTING (SMOTE TO VAE FITTING)

In [None]:
X = pd.DataFrame(x_train) #Now equal formatting with x_test
Y = pd.Series(y_train) #Now equal formatting with y_test

VAE MODEL FITTING

In [None]:
%%time
hist = vae.fit(np.array(X,Y),np.array(X,Y),
               shuffle=True,
               epochs=epochs,
               verbose=1,
               batch_size=batch_size,
               validation_data=(np.array(x_test,y_test), np.array(x_test,y_test), None),
               callbacks=[WarmUpCallback(beta, kappa),
                          TQDMNotebookCallback(leave_inner=True, leave_outer=True)])

PRE-TRAINED VAE MODEL LOSS (VISUALIZATION)

In [None]:
# Training curve first, validation curve second (matches the legend order)
for history_key in ('loss', 'val_loss'):
    plt.plot(hist.history[history_key])

plt.title('Pre-trained VAE Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper right')

# Keep a handle on the figure before it is shown
fig = plt.gcf()
plt.show()

SAVE TRAINED VAE MODEL LAYERS

In [None]:
# DEFINE ENCODER
# Maps the omics input to z, the output of the sampling Lambda layer.
# NOTE(review): z contains the random noise drawn in sampling(), so encodings
# are not deterministic - confirm whether z_mean_encoded was the intended output.
encoder = Model(omics_input, z)
fold_count = 100  # only used as a suffix in the output file names

# SAVE THE ENCODER
# Architecture goes to a JSON file, weights to an HDF5 file
# (model_from_json is imported but not used in this cell)
from keras.models import model_from_json
model_json = encoder.to_json()
with open("encoder" + str(fold_count) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights("encoder" + str(fold_count) + ".h5")
print("Saved model to disk")

# DEFINE DECODER
# Stand-alone model that reuses the trained decoder layer on a new latent input
decoder_input = Input(shape=(latent_dim, )) 
decoder_output = decoder_to_reconstruct(decoder_input)
decoder = Model(decoder_input, decoder_output)

ENCODING TEST DATA

In [None]:
# Encode the test data into the latent representation and keep the test-set
# row index on the result.
# NOTE(review): the encoder outputs the sampled z, so repeated calls can give
# different encodings for the same input - confirm this is acceptable.
test_encoded = encoder.predict(x_test, batch_size = batch_size)
test_encoded_df = pd.DataFrame(test_encoded, index = x_test.index)

DECODING TEST DATA

In [None]:
# How well does the model reconstruct the input data
test_reconstructed = decoder.predict(np.array(test_encoded_df))
test_reconstructed_df = pd.DataFrame(test_reconstructed, index = x_test.index, columns = x_test.columns)

RECONSTRUCTION ERROR

In [None]:
from sklearn.metrics import mean_squared_error
reconstruction_error = mean_squared_error(np.array(x_test), np.array(test_reconstructed_df))
print("TEST RECONSTRUCTION ERROR: " + str(reconstruction_error))

RECONSTRUCTION ACCURACY

In [None]:
# Reported "accuracy" is 1 minus the mean squared reconstruction error,
# expressed as a percentage.
# NOTE(review): this is a reconstruction score, not a classification accuracy -
# confirm the label used in the printed message.
Accuracy_rate = 1
accuracy = Accuracy_rate - reconstruction_error
print("VAE MODEL ACCURACY: ",accuracy*100)

RANDOM FOREST CLASSIFIER

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=5, random_state=random_seed)
classifier = rf.fit(x_train, y_train)

CONFUSION MATRIX DECLARATION

In [None]:
# Only confusion_matrix is needed from scikit-learn: the plotting helper used
# by this notebook is the plot_confusion_matrix function defined in the next
# cell, and scikit-learn 1.2+ no longer provides
# sklearn.metrics.plot_confusion_matrix, so importing it fails there.
from sklearn.metrics import confusion_matrix

# Classify the reconstructed test samples with the random forest
preds = rf.predict(test_reconstructed_df)
# Keep the matrix in a variable so later cells can reuse it, then display it
cm = confusion_matrix(y_test, preds)
cm

CONFUSION MATRIX FUNCTION

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts is accepted)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  pass None to leave the axis ticks unlabeled

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues')

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its total)

    Returns
    -------
    None; the figure is displayed with plt.show()

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Accept nested lists as well as arrays
    cm = np.asarray(cm)

    # Accuracy is always computed from the raw counts
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no samples keeps an all-zero row instead of becoming NaN
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    # Draw the heat map after normalisation so cell colours, cell labels and
    # the text-colour threshold all refer to the same values
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold are dark, so their label is printed in white
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()


CONFUSION MATRIX (VISUALIZATION)

In [None]:
# Plot the matrix computed from the current predictions rather than numbers
# copied from an earlier run, so the figure always matches the model.
# confusion_matrix orders classes by label value (0, then 1), which matches
# the order of target_names.
plot_confusion_matrix(cm           = confusion_matrix(y_test, preds),
                      normalize    = False,
                      target_names = ['Non-Cancerous (0)', 'Cancerous (1)'],
                      title        = "Confusion Matrix")

CONFUSION METRICS ELEMENTS

In [None]:
def recall(y_true, y_pred):
    """Keras metric: true positives / (actual positives + epsilon), after rounding to 0/1."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the division defined when there are no positives
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision(y_true, y_pred):
    """Keras metric: true positives / (predicted positives + epsilon), after rounding to 0/1."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the division defined when nothing is predicted positive
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1(y_true, y_pred):
    """Keras metric: harmonic mean of precision() and recall(), guarded by epsilon."""
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

VAE MODEL - FINE TUNING

In [None]:
# Classifier built on the selected omics features: dropout, two parallel
# 6000-unit ReLU branches that are summed, then a single sigmoid output.
#
# No encoder weights are loaded into these layers. list.append() returns None
# and encoder.layers[0] is the weight-less input layer, so passing that result
# as `weights=` never transferred anything. The encoder's Dense kernels also
# have `latent_dim` units and cannot initialise 6000-unit layers.
# NOTE(review): to really fine-tune from the VAE, size the branches to
# `latent_dim` and copy the weights of the encoder's two Dense layers.
input1 = tf.keras.layers.Input(shape=(original_dim,))
Drop = tf.keras.layers.Dropout(0.2)(input1)

x1 = tf.keras.layers.Dense(6000, activation = 'relu', kernel_initializer= "glorot_uniform", kernel_regularizer=regularizers.l2(0.01))(Drop)
x2 = tf.keras.layers.Dense(6000, activation = 'relu', kernel_initializer= "glorot_uniform", kernel_regularizer=regularizers.l2(0.01))(Drop)

Adding_layers = tf.keras.layers.Add()([x1, x2])
x3 = tf.keras.layers.Dense(1, activation = 'sigmoid', kernel_initializer= "glorot_uniform", kernel_regularizer=regularizers.l2(0.01))(Adding_layers)

FT_model = tf.keras.models.Model(inputs = [input1], outputs = x3)
FT_model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy', recall, precision, f1])
FT_model.summary()

# Train on the (oversampled) training split and validate on the held-out test
# split. Fitting on reconstructed test samples while validating on the same
# patients lets the model see the validation labels during training.
hist2 = FT_model.fit(np.asarray(x_train), np.asarray(y_train),
                     epochs = 50, batch_size = 4,
                     validation_data = (np.asarray(x_test), np.asarray(y_test)))

VAE MODEL LOSS (FINE - TUNING)

In [None]:
# Training curve first, validation curve second (matches the legend order)
for history_key in ('loss', 'val_loss'):
    plt.plot(hist2.history[history_key])

plt.title('VAE Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper right')

# Keep a handle on the figure before it is shown
fig = plt.gcf()
plt.show()