In [1]:
#importing libraries

import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer,OneHotEncoder, VectorAssembler

from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator



In [2]:
#creating spark session
findspark.init()
findspark.find()

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("MQTT") \
    .getOrCreate()

spark = SparkSession.builder.appName("mqttProject").getOrCreate()
sc    = spark.sparkContext

sqlContext = SQLContext(sc)



In [58]:
train = spark.read.csv( (r"C:\Users\SAROJ SATHISH\Downloads\MMQT\train70_augmented.csv"),header=True, inferSchema= True)
test  = spark.read.csv( (r"C:\Users\SAROJ SATHISH\Downloads\MMQT\test30_augmented.csv"),header=True, inferSchema= True)
DF = train.union(test)
DF = DF.toDF(*(c.replace('.', '_') for c in DF.columns))
train = train.toDF(*(c.replace('.', '_') for c in train.columns))
test = test.toDF(*(c.replace('.', '_') for c in test.columns))

In [59]:
numeric_features = [feature[0] for feature in DF.dtypes if feature[1] not in ('string')]
string_features = [feature[0] for feature in DF.dtypes if feature[1] in ('string')]
to_drop =  ["mqtt_conflag_cleansess","mqtt_proto_len","mqtt_conflag_passwd","mqtt_qos"]

In [60]:
col_names = ['tcp_flags','tcp_time_delta','tcp_len','mqtt_conack_flags','mqtt_conack_flags_reserved','mqtt_conack_flags_sp',
 'mqtt_conack_val','mqtt_conflag_cleansess','mqtt_conflag_passwd','mqtt_conflag_qos','mqtt_conflag_reserved',
 'mqtt_conflag_retain','mqtt_conflag_uname','mqtt_conflag_willflag','mqtt_conflags','mqtt_dupflag','mqtt_hdrflags', 
 'mqtt_kalive', 'mqtt_len','mqtt_msg','mqtt_msgid', 'mqtt_msgtype', 'mqtt_proto_len', 'mqtt_qos', 'mqtt_retain',
 'mqtt_sub_qos', 'mqtt_suback_qos', 'mqtt_ver', 'mqtt_willmsg', 'mqtt_willmsg_len', 'mqtt_willtopic', 'mqtt_willtopic_len',
 'target']

nominal_cols = ['tcp_flags','mqtt_conack_flags','mqtt_conflags', 'mqtt_hdrflags', 'mqtt_msg', 'mqtt_protoname']

continuous_cols = ['tcp_time_delta', 'tcp_len', 'mqtt_conack_flags_reserved', 'mqtt_conack_flags_sp', 'mqtt_conack_val',
 'mqtt_conflag_cleansess', 'mqtt_conflag_passwd', 'mqtt_conflag_qos', 'mqtt_conflag_reserved', 'mqtt_conflag_retain',
 'mqtt_conflag_uname', 'mqtt_conflag_willflag', 'mqtt_dupflag', 'mqtt_kalive', 'mqtt_len', 'mqtt_msgid',
 'mqtt_msgtype', 'mqtt_proto_len', 'mqtt_qos', 'mqtt_retain', 'mqtt_sub_qos', 'mqtt_suback_qos', 'mqtt_ver',
 'mqtt_willmsg', 'mqtt_willmsg_len', 'mqtt_willtopic', 'mqtt_willtopic_len']

In [61]:
class OutcomeCreater_binary(Transformer): # this defines a transformer that creates the outcome column
    
    def __init__(self):
        super().__init__()
  
    def _transform(self, dataset):
        label_to_binary = udf(lambda name: 0.0 if name == 'legitimate' else 1.0)
        output_df = dataset.withColumn('outcome', label_to_binary(col('target'))).drop("target")  
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df
    
class OutcomeCreater_multi(Transformer): # this defines a transformer that creates the outcome column
    
    def __init__(self):
        super().__init__()
  
    def _transform(self, dataset):
        label_to_multiple = udf(lambda name: 0.0 if name == 'legitimate' else (1.0 if name == "flood" else(2.0 if name == "dos" else(3.0 if name == "bruteforce" else(4.0 if name == "slowite" else (5.0))))))
        output_df = dataset.withColumn('outcome', label_to_multiple(col('target'))).drop("target")  
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df
        
class FeatureTypeCaster(Transformer): # this transformer will cast the columns as appropriate types  
    def __init__(self):
        super().__init__()

    def _transform(self, dataset):
        output_df = dataset
        for col_name in continuous_cols:
            output_df = output_df.withColumn(col_name,col(col_name).cast(DoubleType()))

        return output_df
    
class ColumnDropper(Transformer): # this transformer drops unnecessary columns
    def __init__(self, columns_to_drop = None):
        super().__init__()
        self.columns_to_drop=columns_to_drop
    def _transform(self, dataset):
        output_df = dataset
        for col_name in self.columns_to_drop:
            output_df = output_df.drop(col_name)
            
        return output_df
    
def get_preprocess_pipeline(classification):
    # Stage where columns are casted as appropriate types
    stage_typecaster = FeatureTypeCaster()

    # Stage where nominal columns are transformed to index columns using StringIndexer
    nominal_id_cols = [x+"_index" for x in nominal_cols]
    nominal_onehot_cols = [x+"_encoded" for x in nominal_cols]
    stage_nominal_indexer = StringIndexer(inputCols = nominal_cols, outputCols = nominal_id_cols )
    
    # Stage where the index columns are further transformed using OneHotEncoder
    stage_nominal_onehot_encoder = OneHotEncoder(inputCols=nominal_id_cols, outputCols=nominal_onehot_cols)

    # Stage where all relevant features are assembled into a vector (and dropping a few)
    feature_cols = continuous_cols+nominal_onehot_cols
    corelated_cols_to_remove = to_drop
    
    for col_name in corelated_cols_to_remove:
        feature_cols.remove(col_name)
    stage_vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="vectorized_features")

    # Stage where we scale the columns
    stage_scaler = StandardScaler(inputCol= 'vectorized_features', outputCol= 'features')
    

    # Stage for creating the outcome column representing whether there is attack
    if(classification == "binary"): 
        stage_outcome = OutcomeCreater_binary()
    else:
        stage_outcome = OutcomeCreater_multi()

    # Removing all unnecessary columbs, only keeping the 'features' and 'outcome' columns
    stage_column_dropper = ColumnDropper(columns_to_drop = nominal_cols+nominal_id_cols+
        nominal_onehot_cols + continuous_cols + ['vectorized_features'])
    
    pipeline = Pipeline(stages=[stage_typecaster,stage_nominal_indexer,stage_nominal_onehot_encoder,
        stage_vector_assembler,stage_scaler,stage_outcome,stage_column_dropper])
    
    return pipeline 

### Multiclass classification

In [62]:
preprocess_pipeline = get_preprocess_pipeline("multi")
preprocess_pipeline_model = preprocess_pipeline.fit(train)

train_df = preprocess_pipeline_model.transform(train)
test_df = preprocess_pipeline_model.transform(test)

In [63]:
train_df= train_df.limit(1400)
test_df = test_df.limit(600)

In [64]:
train_df.show(15)

+--------------------+-------+
|            features|outcome|
+--------------------+-------+
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,27,30,3...|    3.0|
|(50369,[0,24,30,3...|    1.0|
|(50369,[0,24,30,3...|    4.0|
|(50369,[0,1,12,14...|    1.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,24,30,3...|    3.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,1,8,11,...|    3.0|
|(50369,[1,12,13,1...|    2.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,28,30,3...|    5.0|
|(50369,[0,24,30,3...|    2.0|
+--------------------+-------+
only showing top 15 rows



In [65]:
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))

df_train = train_df
df_validate,df_test = test_df.randomSplit([0.5,0.5])

In [66]:
df_train.show(15)

+--------------------+-------+
|            features|outcome|
+--------------------+-------+
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,27,30,3...|    3.0|
|(50369,[0,24,30,3...|    1.0|
|(50369,[0,24,30,3...|    4.0|
|(50369,[0,1,12,14...|    1.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,24,30,3...|    3.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,1,8,11,...|    3.0|
|(50369,[1,12,13,1...|    2.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,1,12,14...|    0.0|
|(50369,[0,28,30,3...|    5.0|
|(50369,[0,24,30,3...|    2.0|
+--------------------+-------+
only showing top 15 rows



In [67]:
df_train_pandas = df_train.withColumn('features', to_array('features')).toPandas()

In [68]:
df_validate_pandas = df_validate.withColumn('features', to_array('features')).toPandas()
df_test_pandas = df_test.withColumn('features', to_array('features')).toPandas()

In [69]:
import tensorflow as tf
from tensorflow import keras 

# Converting the pandas DataFrame to tensors
# Note we are using 3 data sets train, validate, test

x_train = tf.constant(np.array(df_train_pandas['features'].values.tolist()))
y_train = tf.constant(np.array(df_train_pandas['outcome'].values.tolist()))

x_validate = tf.constant(np.array(df_validate_pandas['features'].values.tolist()))
y_validate = tf.constant(np.array(df_validate_pandas['outcome'].values.tolist()))


x_test = tf.constant(np.array(df_test_pandas['features'].values.tolist()))
y_test = tf.constant(np.array(df_test_pandas['outcome'].values.tolist()))

### Shallow NN

In [70]:
import datetime
model_multiclass = keras.Sequential( [keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.5188 - sparse_categorical_accuracy: 0.4907 - val_loss: 1.2771 - val_sparse_categorical_accuracy: 0.5425 - 1s/epoch - 31ms/step
Epoch 2/20
44/44 - 0s - loss: 1.0914 - sparse_categorical_accuracy: 0.5764 - val_loss: 1.0767 - val_sparse_categorical_accuracy: 0.6438 - 474ms/epoch - 11ms/step
Epoch 3/20
44/44 - 1s - loss: 0.9088 - sparse_categorical_accuracy: 0.6914 - val_loss: 0.9598 - val_sparse_categorical_accuracy: 0.7124 - 590ms/epoch - 13ms/step
Epoch 4/20
44/44 - 1s - loss: 0.7885 - sparse_categorical_accuracy: 0.7750 - val_loss: 0.8747 - val_sparse_categorical_accuracy: 0.7549 - 684ms/epoch - 16ms/step
Epoch 5/20
44/44 - 1s - loss: 0.7007 - sparse_categorical_accuracy: 0.8143 - val_loss: 0.8032 - val_sparse_categorical_accuracy: 0.7876 - 635ms/epoch - 14ms/step
Epoch 6/20
44/44 - 1s - loss: 0.6430 - sparse_categorical_accuracy: 0.8229 - val_loss: 0.7722 - val_sparse_categorical_accuracy: 0.7876 - 575ms/epoch - 13ms/step
Epoch 7/20
44/44 - 0s - loss: 0

<keras.callbacks.History at 0x1ec8533ca90>

#### Hyper parameter tuning with cross validation

##### Shuffling

In [71]:
import tensorflow as tf
from tensorflow import keras 
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

##### Hyperparameters

In [72]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([1,2]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

##### CrossValidation

In [73]:
def CV_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(6))
        model.compile(
          optimizer=keras.optimizers.SGD(),
          loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          metrics=[keras.metrics.SparseCategoricalAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_sparse_categorical_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_sparse_categorical_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_sparse_categorical_accuracy"])
    
    return accuracy/k, d, w, current_best

In [74]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

--- Starting trial: run-WIDTH20-DEPTH1
{'NN_width': 20, 'NN_depth': 1}

Split no 1

Training
Epoch 1/10
30/30 - 2s - loss: 1.6092 - sparse_categorical_accuracy: 0.4690 - val_loss: 1.4310 - val_sparse_categorical_accuracy: 0.4957 - 2s/epoch - 75ms/step
Epoch 2/10
30/30 - 0s - loss: 1.3009 - sparse_categorical_accuracy: 0.5482 - val_loss: 1.3341 - val_sparse_categorical_accuracy: 0.5172 - 270ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 1.1680 - sparse_categorical_accuracy: 0.5675 - val_loss: 1.2631 - val_sparse_categorical_accuracy: 0.5300 - 282ms/epoch - 9ms/step
Epoch 4/10
30/30 - 0s - loss: 1.0873 - sparse_categorical_accuracy: 0.5953 - val_loss: 1.1997 - val_sparse_categorical_accuracy: 0.5472 - 406ms/epoch - 14ms/step
Epoch 5/10
30/30 - 0s - loss: 1.0168 - sparse_categorical_accuracy: 0.6113 - val_loss: 1.1490 - val_sparse_categorical_accuracy: 0.5536 - 281ms/epoch - 9ms/step
Epoch 6/10
30/30 - 0s - loss: 0.9593 - sparse_categorical_accuracy: 0.6253 - val_loss: 1.0973 - val_spa


Split no 3

Training
Epoch 1/10
30/30 - 1s - loss: 1.8612 - sparse_categorical_accuracy: 0.3637 - val_loss: 1.6369 - val_sparse_categorical_accuracy: 0.5150 - 952ms/epoch - 32ms/step
Epoch 2/10
30/30 - 0s - loss: 1.4723 - sparse_categorical_accuracy: 0.5172 - val_loss: 1.4513 - val_sparse_categorical_accuracy: 0.5128 - 282ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 1.2503 - sparse_categorical_accuracy: 0.5172 - val_loss: 1.3367 - val_sparse_categorical_accuracy: 0.5171 - 309ms/epoch - 10ms/step
Epoch 4/10
30/30 - 0s - loss: 1.1123 - sparse_categorical_accuracy: 0.5719 - val_loss: 1.3155 - val_sparse_categorical_accuracy: 0.6410 - 308ms/epoch - 10ms/step
Epoch 5/10
30/30 - 0s - loss: 1.0145 - sparse_categorical_accuracy: 0.6642 - val_loss: 1.2005 - val_sparse_categorical_accuracy: 0.6581 - 316ms/epoch - 11ms/step
Epoch 6/10
30/30 - 0s - loss: 0.9329 - sparse_categorical_accuracy: 0.6824 - val_loss: 1.1473 - val_sparse_categorical_accuracy: 0.6453 - 360ms/epoch - 12ms/step
Epoch 7

30/30 - 0s - loss: 0.7286 - sparse_categorical_accuracy: 0.8266 - val_loss: 0.8666 - val_sparse_categorical_accuracy: 0.7682 - 341ms/epoch - 11ms/step

Split no 2

Training
Epoch 1/10
30/30 - 1s - loss: 1.6632 - sparse_categorical_accuracy: 0.4433 - val_loss: 1.5080 - val_sparse_categorical_accuracy: 0.4721 - 914ms/epoch - 30ms/step
Epoch 2/10
30/30 - 0s - loss: 1.3953 - sparse_categorical_accuracy: 0.5289 - val_loss: 1.3360 - val_sparse_categorical_accuracy: 0.5086 - 277ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 1.1949 - sparse_categorical_accuracy: 0.5407 - val_loss: 1.2623 - val_sparse_categorical_accuracy: 0.5064 - 272ms/epoch - 9ms/step
Epoch 4/10
30/30 - 0s - loss: 1.1018 - sparse_categorical_accuracy: 0.5899 - val_loss: 1.2097 - val_sparse_categorical_accuracy: 0.5365 - 279ms/epoch - 9ms/step
Epoch 5/10
30/30 - 0s - loss: 1.0096 - sparse_categorical_accuracy: 0.6328 - val_loss: 1.1127 - val_sparse_categorical_accuracy: 0.6330 - 304ms/epoch - 10ms/step
Epoch 6/10
30/30 - 0

Epoch 10/10
30/30 - 0s - loss: 0.6307 - sparse_categorical_accuracy: 0.8208 - val_loss: 1.0427 - val_sparse_categorical_accuracy: 0.7500 - 295ms/epoch - 10ms/step
--- Starting trial: run-WIDTH40-DEPTH2
{'NN_width': 40, 'NN_depth': 2}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 1.6375 - sparse_categorical_accuracy: 0.4754 - val_loss: 1.5125 - val_sparse_categorical_accuracy: 0.4807 - 816ms/epoch - 27ms/step
Epoch 2/10
30/30 - 0s - loss: 1.3237 - sparse_categorical_accuracy: 0.5343 - val_loss: 1.4048 - val_sparse_categorical_accuracy: 0.4957 - 378ms/epoch - 13ms/step
Epoch 3/10
30/30 - 0s - loss: 1.1688 - sparse_categorical_accuracy: 0.5482 - val_loss: 1.3206 - val_sparse_categorical_accuracy: 0.5322 - 345ms/epoch - 12ms/step
Epoch 4/10
30/30 - 0s - loss: 1.0581 - sparse_categorical_accuracy: 0.5996 - val_loss: 1.2270 - val_sparse_categorical_accuracy: 0.6116 - 449ms/epoch - 15ms/step
Epoch 5/10
30/30 - 0s - loss: 0.9702 - sparse_categorical_accuracy: 0.6724 - val_loss: 1.1792 - 

##### Best parameters

In [75]:
print("Tuned Depth for shallow NN = ",d)
print("Tuned Widhth for shallow NN = ",w)

Tuned Depth for shallow NN =  2
Tuned Widhth for shallow NN =  20


#### Tuned Hyper parameters

In [76]:
model_multiclass = keras.Sequential( [keras.layers.Dense(20,activation='relu'),keras.layers.Dense(20,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.6055 - sparse_categorical_accuracy: 0.4179 - val_loss: 1.3592 - val_sparse_categorical_accuracy: 0.5131 - 860ms/epoch - 20ms/step
Epoch 2/20
44/44 - 0s - loss: 1.1363 - sparse_categorical_accuracy: 0.5914 - val_loss: 1.1916 - val_sparse_categorical_accuracy: 0.6536 - 418ms/epoch - 10ms/step
Epoch 3/20
44/44 - 0s - loss: 0.9574 - sparse_categorical_accuracy: 0.6921 - val_loss: 1.0985 - val_sparse_categorical_accuracy: 0.6634 - 453ms/epoch - 10ms/step
Epoch 4/20
44/44 - 0s - loss: 0.8394 - sparse_categorical_accuracy: 0.7257 - val_loss: 1.0640 - val_sparse_categorical_accuracy: 0.6961 - 404ms/epoch - 9ms/step
Epoch 5/20
44/44 - 0s - loss: 0.7541 - sparse_categorical_accuracy: 0.7657 - val_loss: 1.0277 - val_sparse_categorical_accuracy: 0.7222 - 414ms/epoch - 9ms/step
Epoch 6/20
44/44 - 0s - loss: 0.6978 - sparse_categorical_accuracy: 0.7864 - val_loss: 1.0115 - val_sparse_categorical_accuracy: 0.7255 - 437ms/epoch - 10ms/step
Epoch 7/20
44/44 - 0s - loss: 

<keras.callbacks.History at 0x1ed0e48f9a0>

#### Evaluate on Test set

In [77]:
print("Evaluate on test data")
results = model_multiclass.evaluate(x_test, y_test)
print("test accuracy = ", results[1]);

Evaluate on test data
test accuracy =  0.7653061151504517


### Deep NN

In [78]:
import datetime
model_multiclass = keras.Sequential( [keras.layers.Dense(30,activation='relu'),keras.layers.Dense(30,activation='relu'),
                                      keras.layers.Dense(20,activation='relu'),keras.layers.Dense(20,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.6440 - sparse_categorical_accuracy: 0.4771 - val_loss: 1.5313 - val_sparse_categorical_accuracy: 0.5131 - 918ms/epoch - 21ms/step
Epoch 2/20
44/44 - 0s - loss: 1.4008 - sparse_categorical_accuracy: 0.5243 - val_loss: 1.4094 - val_sparse_categorical_accuracy: 0.5131 - 477ms/epoch - 11ms/step
Epoch 3/20
44/44 - 0s - loss: 1.2963 - sparse_categorical_accuracy: 0.5243 - val_loss: 1.3223 - val_sparse_categorical_accuracy: 0.5196 - 493ms/epoch - 11ms/step
Epoch 4/20
44/44 - 0s - loss: 1.2031 - sparse_categorical_accuracy: 0.5250 - val_loss: 1.2587 - val_sparse_categorical_accuracy: 0.5131 - 466ms/epoch - 11ms/step
Epoch 5/20
44/44 - 0s - loss: 1.1179 - sparse_categorical_accuracy: 0.5257 - val_loss: 1.1928 - val_sparse_categorical_accuracy: 0.5261 - 446ms/epoch - 10ms/step
Epoch 6/20
44/44 - 1s - loss: 1.0424 - sparse_categorical_accuracy: 0.5436 - val_loss: 1.1451 - val_sparse_categorical_accuracy: 0.5523 - 551ms/epoch - 13ms/step
Epoch 7/20
44/44 - 1s - loss

<keras.callbacks.History at 0x1ed51970c40>

##### Cross validation function

In [79]:
def CV_Deep_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(6))
        model.compile(
          optimizer=keras.optimizers.SGD(),
          loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          metrics=[keras.metrics.SparseCategoricalAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_sparse_categorical_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_sparse_categorical_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_sparse_categorical_accuracy"])
    
    return accuracy/k, d, w, current_best

#### Shuffling

In [80]:
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

##### Hyperparameters

In [81]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([4,5,6]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

In [82]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_Deep_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

--- Starting trial: run-WIDTH20-DEPTH4
{'NN_width': 20, 'NN_depth': 4}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 1.7969 - sparse_categorical_accuracy: 0.4304 - val_loss: 1.7312 - val_sparse_categorical_accuracy: 0.4807 - 707ms/epoch - 24ms/step
Epoch 2/10
30/30 - 0s - loss: 1.6487 - sparse_categorical_accuracy: 0.4936 - val_loss: 1.6003 - val_sparse_categorical_accuracy: 0.5021 - 270ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 1.4741 - sparse_categorical_accuracy: 0.5150 - val_loss: 1.4956 - val_sparse_categorical_accuracy: 0.5086 - 269ms/epoch - 9ms/step
Epoch 4/10
30/30 - 0s - loss: 1.3771 - sparse_categorical_accuracy: 0.5203 - val_loss: 1.4454 - val_sparse_categorical_accuracy: 0.5086 - 264ms/epoch - 9ms/step
Epoch 5/10
30/30 - 0s - loss: 1.3267 - sparse_categorical_accuracy: 0.5225 - val_loss: 1.4126 - val_sparse_categorical_accuracy: 0.5086 - 255ms/epoch - 8ms/step
Epoch 6/10
30/30 - 0s - loss: 1.2822 - sparse_categorical_accuracy: 0.5310 - val_loss: 1.3724 - val_s


Split no 3

Training
Epoch 1/10
30/30 - 2s - loss: 1.7266 - sparse_categorical_accuracy: 0.4560 - val_loss: 1.6768 - val_sparse_categorical_accuracy: 0.4722 - 2s/epoch - 75ms/step
Epoch 2/10
30/30 - 0s - loss: 1.5984 - sparse_categorical_accuracy: 0.5343 - val_loss: 1.6013 - val_sparse_categorical_accuracy: 0.4808 - 267ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 1.5001 - sparse_categorical_accuracy: 0.5397 - val_loss: 1.5364 - val_sparse_categorical_accuracy: 0.4808 - 259ms/epoch - 9ms/step
Epoch 4/10
30/30 - 0s - loss: 1.4152 - sparse_categorical_accuracy: 0.5397 - val_loss: 1.5077 - val_sparse_categorical_accuracy: 0.4637 - 253ms/epoch - 8ms/step
Epoch 5/10
30/30 - 0s - loss: 1.3782 - sparse_categorical_accuracy: 0.5386 - val_loss: 1.4896 - val_sparse_categorical_accuracy: 0.4615 - 269ms/epoch - 9ms/step
Epoch 6/10
30/30 - 0s - loss: 1.3344 - sparse_categorical_accuracy: 0.5397 - val_loss: 1.4700 - val_sparse_categorical_accuracy: 0.4551 - 288ms/epoch - 10ms/step
Epoch 7/10
30


Split no 2

Training
Epoch 1/10
30/30 - 1s - loss: 1.7067 - sparse_categorical_accuracy: 0.4325 - val_loss: 1.6308 - val_sparse_categorical_accuracy: 0.5064 - 891ms/epoch - 30ms/step
Epoch 2/10
30/30 - 0s - loss: 1.5571 - sparse_categorical_accuracy: 0.5075 - val_loss: 1.4836 - val_sparse_categorical_accuracy: 0.5343 - 277ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 1.4219 - sparse_categorical_accuracy: 0.5150 - val_loss: 1.3948 - val_sparse_categorical_accuracy: 0.5279 - 331ms/epoch - 11ms/step
Epoch 4/10
30/30 - 0s - loss: 1.3519 - sparse_categorical_accuracy: 0.5150 - val_loss: 1.3510 - val_sparse_categorical_accuracy: 0.5236 - 277ms/epoch - 9ms/step
Epoch 5/10
30/30 - 0s - loss: 1.2983 - sparse_categorical_accuracy: 0.5139 - val_loss: 1.3223 - val_sparse_categorical_accuracy: 0.5193 - 303ms/epoch - 10ms/step
Epoch 6/10
30/30 - 0s - loss: 1.2499 - sparse_categorical_accuracy: 0.5150 - val_loss: 1.2696 - val_sparse_categorical_accuracy: 0.5193 - 308ms/epoch - 10ms/step
Epoch 7/

--- Starting trial: run-WIDTH30-DEPTH6
{'NN_width': 30, 'NN_depth': 6}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 1.7306 - sparse_categorical_accuracy: 0.3844 - val_loss: 1.6721 - val_sparse_categorical_accuracy: 0.4871 - 868ms/epoch - 29ms/step
Epoch 2/10
30/30 - 0s - loss: 1.6052 - sparse_categorical_accuracy: 0.4979 - val_loss: 1.5516 - val_sparse_categorical_accuracy: 0.5129 - 295ms/epoch - 10ms/step
Epoch 3/10
30/30 - 0s - loss: 1.4693 - sparse_categorical_accuracy: 0.5161 - val_loss: 1.4653 - val_sparse_categorical_accuracy: 0.5064 - 290ms/epoch - 10ms/step
Epoch 4/10
30/30 - 0s - loss: 1.3862 - sparse_categorical_accuracy: 0.5171 - val_loss: 1.4211 - val_sparse_categorical_accuracy: 0.5043 - 288ms/epoch - 10ms/step
Epoch 5/10
30/30 - 0s - loss: 1.3079 - sparse_categorical_accuracy: 0.5182 - val_loss: 1.3413 - val_sparse_categorical_accuracy: 0.5043 - 312ms/epoch - 10ms/step
Epoch 6/10
30/30 - 0s - loss: 1.2336 - sparse_categorical_accuracy: 0.5182 - val_loss: 1.2839 - v

Epoch 10/10
30/30 - 0s - loss: 0.8855 - sparse_categorical_accuracy: 0.7827 - val_loss: 1.3269 - val_sparse_categorical_accuracy: 0.5343 - 400ms/epoch - 13ms/step

Split no 3

Training
Epoch 1/10
30/30 - 1s - loss: 1.6407 - sparse_categorical_accuracy: 0.4764 - val_loss: 1.5269 - val_sparse_categorical_accuracy: 0.4722 - 1s/epoch - 35ms/step
Epoch 2/10
30/30 - 0s - loss: 1.3832 - sparse_categorical_accuracy: 0.5386 - val_loss: 1.4474 - val_sparse_categorical_accuracy: 0.4722 - 393ms/epoch - 13ms/step
Epoch 3/10
30/30 - 0s - loss: 1.2554 - sparse_categorical_accuracy: 0.5408 - val_loss: 1.3701 - val_sparse_categorical_accuracy: 0.4615 - 397ms/epoch - 13ms/step
Epoch 4/10
30/30 - 0s - loss: 1.1595 - sparse_categorical_accuracy: 0.5429 - val_loss: 1.3438 - val_sparse_categorical_accuracy: 0.4594 - 407ms/epoch - 14ms/step
Epoch 5/10
30/30 - 0s - loss: 1.0845 - sparse_categorical_accuracy: 0.5536 - val_loss: 1.2786 - val_sparse_categorical_accuracy: 0.4573 - 397ms/epoch - 13ms/step
Epoch 6/

30/30 - 0s - loss: 1.0529 - sparse_categorical_accuracy: 0.5396 - val_loss: 1.1383 - val_sparse_categorical_accuracy: 0.5451 - 343ms/epoch - 11ms/step
Epoch 10/10
30/30 - 0s - loss: 1.0361 - sparse_categorical_accuracy: 0.6156 - val_loss: 1.1253 - val_sparse_categorical_accuracy: 0.6137 - 442ms/epoch - 15ms/step

Split no 2

Training
Epoch 1/10
30/30 - 1s - loss: 1.7846 - sparse_categorical_accuracy: 0.4433 - val_loss: 1.7222 - val_sparse_categorical_accuracy: 0.5000 - 937ms/epoch - 31ms/step
Epoch 2/10
30/30 - 0s - loss: 1.6595 - sparse_categorical_accuracy: 0.4818 - val_loss: 1.6179 - val_sparse_categorical_accuracy: 0.5129 - 326ms/epoch - 11ms/step
Epoch 3/10
30/30 - 0s - loss: 1.5376 - sparse_categorical_accuracy: 0.5064 - val_loss: 1.5182 - val_sparse_categorical_accuracy: 0.5129 - 348ms/epoch - 12ms/step
Epoch 4/10
30/30 - 0s - loss: 1.4349 - sparse_categorical_accuracy: 0.5107 - val_loss: 1.4606 - val_sparse_categorical_accuracy: 0.5086 - 342ms/epoch - 11ms/step
Epoch 5/10
30/30

### Best Parameters

In [83]:
print("Tuned Depth for Deep NN = ",d)
print("Tuned Widhth for Deep NN = ",w)

Tuned Depth for Deep NN =  4
Tuned Widhth for Deep NN =  40


In [84]:
model_multiclass = keras.Sequential( [keras.layers.Dense(40,activation='relu'),keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(40,activation='relu'), keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.7159 - sparse_categorical_accuracy: 0.4200 - val_loss: 1.5677 - val_sparse_categorical_accuracy: 0.5000 - 1s/epoch - 25ms/step
Epoch 2/20
44/44 - 1s - loss: 1.4641 - sparse_categorical_accuracy: 0.5229 - val_loss: 1.3996 - val_sparse_categorical_accuracy: 0.5229 - 552ms/epoch - 13ms/step
Epoch 3/20
44/44 - 1s - loss: 1.3304 - sparse_categorical_accuracy: 0.5250 - val_loss: 1.3256 - val_sparse_categorical_accuracy: 0.5163 - 631ms/epoch - 14ms/step
Epoch 4/20
44/44 - 1s - loss: 1.2420 - sparse_categorical_accuracy: 0.5250 - val_loss: 1.2552 - val_sparse_categorical_accuracy: 0.5163 - 601ms/epoch - 14ms/step
Epoch 5/20
44/44 - 1s - loss: 1.1478 - sparse_categorical_accuracy: 0.5357 - val_loss: 1.1759 - val_sparse_categorical_accuracy: 0.5556 - 569ms/epoch - 13ms/step
Epoch 6/20
44/44 - 1s - loss: 1.0569 - sparse_categorical_accuracy: 0.6079 - val_loss: 1.0988 - val_sparse_categorical_accuracy: 0.6111 - 557ms/epoch - 13ms/step
Epoch 7/20
44/44 - 1s - loss: 0

<keras.callbacks.History at 0x1ecd9f1d2e0>

### Testing on Test set

In [85]:
print("Evaluate on test data")
results = model_multiclass.evaluate(x_test, y_test)
print("test accuracy = ", results[1]);

Evaluate on test data
test accuracy =  0.795918345451355


## Binary Classification

In [31]:
train = spark.read.csv( (r"C:\Users\SAROJ SATHISH\Downloads\MMQT\train70_augmented.csv"),header=True, inferSchema= True)
test  = spark.read.csv( (r"C:\Users\SAROJ SATHISH\Downloads\MMQT\test30_augmented.csv"),header=True, inferSchema= True)
DF = train.union(test)
DF = DF.toDF(*(c.replace('.', '_') for c in DF.columns))
train = train.toDF(*(c.replace('.', '_') for c in train.columns))
test = test.toDF(*(c.replace('.', '_') for c in test.columns))

In [32]:
preprocess_pipeline = get_preprocess_pipeline("binary")
preprocess_pipeline_model = preprocess_pipeline.fit(train)

train_df = preprocess_pipeline_model.transform(train)
test_df = preprocess_pipeline_model.transform(test)

In [33]:
train_df= train_df.limit(1400)
test_df = test_df.limit(600)

In [34]:
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))

df_train = train_df
df_validate,df_test = test_df.randomSplit([0.5,0.5])

In [35]:
df_train_pandas = df_train.withColumn('features', to_array('features')).toPandas()
df_validate_pandas = df_validate.withColumn('features', to_array('features')).toPandas()
df_test_pandas = df_test.withColumn('features', to_array('features')).toPandas()

In [36]:
import tensorflow as tf
from tensorflow import keras 

# Converting the pandas DataFrame to tensors
# Note we are using 3 data sets train, validate, test

x_train = tf.constant(np.array(df_train_pandas['features'].values.tolist()))
y_train = tf.constant(np.array(df_train_pandas['outcome'].values.tolist()))

x_validate = tf.constant(np.array(df_validate_pandas['features'].values.tolist()))
y_validate = tf.constant(np.array(df_validate_pandas['outcome'].values.tolist()))


x_test = tf.constant(np.array(df_test_pandas['features'].values.tolist()))
y_test = tf.constant(np.array(df_test_pandas['outcome'].values.tolist()))

#### Shallow NN

In [37]:
import datetime
model = keras.Sequential( [keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(1)] )
model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 10,validation_data=(x_validate,y_validate),verbose = 2)

Epoch 1/10
44/44 - 1s - loss: 0.8717 - auc: 0.7351 - binary_accuracy: 0.5057 - val_loss: 0.6212 - val_auc: 0.8857 - val_binary_accuracy: 0.7785 - 1s/epoch - 33ms/step
Epoch 2/10
44/44 - 0s - loss: 0.4807 - auc: 0.9146 - binary_accuracy: 0.7371 - val_loss: 0.5415 - val_auc: 0.9093 - val_binary_accuracy: 0.7427 - 361ms/epoch - 8ms/step
Epoch 3/10
44/44 - 0s - loss: 0.4105 - auc: 0.9310 - binary_accuracy: 0.8193 - val_loss: 0.4855 - val_auc: 0.9169 - val_binary_accuracy: 0.8078 - 363ms/epoch - 8ms/step
Epoch 4/10
44/44 - 0s - loss: 0.3647 - auc: 0.9370 - binary_accuracy: 0.8429 - val_loss: 0.4447 - val_auc: 0.9212 - val_binary_accuracy: 0.8274 - 363ms/epoch - 8ms/step
Epoch 5/10
44/44 - 0s - loss: 0.3302 - auc: 0.9536 - binary_accuracy: 0.8471 - val_loss: 0.4258 - val_auc: 0.9233 - val_binary_accuracy: 0.8241 - 362ms/epoch - 8ms/step
Epoch 6/10
44/44 - 0s - loss: 0.3072 - auc: 0.9532 - binary_accuracy: 0.8521 - val_loss: 0.4008 - val_auc: 0.9238 - val_binary_accuracy: 0.8339 - 393ms/epoch

<keras.callbacks.History at 0x1ed54114eb0>

#### Shuffling

In [38]:
import tensorflow as tf
from tensorflow import keras 
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

#### Hyperparameters

In [42]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([1,2]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

#### Cross Validation

In [43]:
def CV_Binary_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(1))
        model.compile(optimizer = 'sgd',
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_binary_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_binary_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_binary_accuracy"])
    
    return accuracy/k, d, w, current_best

#### Tuning

In [44]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_Binary_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

--- Starting trial: run-WIDTH20-DEPTH1
{'NN_width': 20, 'NN_depth': 1}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 0.7125 - auc_2: 0.7373 - binary_accuracy: 0.5000 - val_loss: 0.6784 - val_auc_2: 0.8002 - val_binary_accuracy: 0.7082 - 1s/epoch - 43ms/step
Epoch 2/10
30/30 - 0s - loss: 0.5523 - auc_2: 0.8593 - binary_accuracy: 0.6167 - val_loss: 0.6024 - val_auc_2: 0.8666 - val_binary_accuracy: 0.5601 - 352ms/epoch - 12ms/step
Epoch 3/10
30/30 - 0s - loss: 0.4848 - auc_2: 0.9229 - binary_accuracy: 0.7088 - val_loss: 0.5502 - val_auc_2: 0.8861 - val_binary_accuracy: 0.7704 - 323ms/epoch - 11ms/step
Epoch 4/10
30/30 - 0s - loss: 0.4370 - auc_2: 0.9314 - binary_accuracy: 0.8116 - val_loss: 0.5111 - val_auc_2: 0.8917 - val_binary_accuracy: 0.7747 - 311ms/epoch - 10ms/step
Epoch 5/10
30/30 - 0s - loss: 0.4009 - auc_2: 0.9323 - binary_accuracy: 0.8266 - val_loss: 0.4825 - val_auc_2: 0.8944 - val_binary_accuracy: 0.7897 - 290ms/epoch - 10ms/step
Epoch 6/10
30/30 - 0s - loss: 0.3656 - a

Epoch 7/10
30/30 - 0s - loss: 0.3634 - auc_6: 0.9410 - binary_accuracy: 0.8276 - val_loss: 0.4269 - val_auc_6: 0.9000 - val_binary_accuracy: 0.8112 - 255ms/epoch - 9ms/step
Epoch 8/10
30/30 - 0s - loss: 0.3312 - auc_6: 0.9520 - binary_accuracy: 0.8426 - val_loss: 0.4035 - val_auc_6: 0.9102 - val_binary_accuracy: 0.8133 - 253ms/epoch - 8ms/step
Epoch 9/10
30/30 - 0s - loss: 0.3075 - auc_6: 0.9518 - binary_accuracy: 0.8490 - val_loss: 0.3899 - val_auc_6: 0.9094 - val_binary_accuracy: 0.8262 - 251ms/epoch - 8ms/step
Epoch 10/10
30/30 - 0s - loss: 0.2905 - auc_6: 0.9534 - binary_accuracy: 0.8587 - val_loss: 0.3730 - val_auc_6: 0.9174 - val_binary_accuracy: 0.8219 - 264ms/epoch - 9ms/step

Split no 3

Training
Epoch 1/10
30/30 - 1s - loss: 0.6555 - auc_7: 0.7613 - binary_accuracy: 0.4979 - val_loss: 0.6684 - val_auc_7: 0.7871 - val_binary_accuracy: 0.5598 - 947ms/epoch - 32ms/step
Epoch 2/10
30/30 - 0s - loss: 0.6031 - auc_7: 0.8939 - binary_accuracy: 0.6084 - val_loss: 0.6385 - val_auc_7: 

Epoch 3/10
30/30 - 0s - loss: 0.4643 - auc_11: 0.9248 - binary_accuracy: 0.7452 - val_loss: 0.6468 - val_auc_11: 0.8586 - val_binary_accuracy: 0.5644 - 388ms/epoch - 13ms/step
Epoch 4/10
30/30 - 0s - loss: 0.4042 - auc_11: 0.9383 - binary_accuracy: 0.8201 - val_loss: 0.5593 - val_auc_11: 0.8689 - val_binary_accuracy: 0.7747 - 304ms/epoch - 10ms/step
Epoch 5/10
30/30 - 0s - loss: 0.3599 - auc_11: 0.9429 - binary_accuracy: 0.8469 - val_loss: 0.5384 - val_auc_11: 0.8751 - val_binary_accuracy: 0.7811 - 276ms/epoch - 9ms/step
Epoch 6/10
30/30 - 0s - loss: 0.3244 - auc_11: 0.9529 - binary_accuracy: 0.8565 - val_loss: 0.5064 - val_auc_11: 0.8820 - val_binary_accuracy: 0.7918 - 289ms/epoch - 10ms/step
Epoch 7/10
30/30 - 0s - loss: 0.3065 - auc_11: 0.9489 - binary_accuracy: 0.8576 - val_loss: 0.5116 - val_auc_11: 0.8851 - val_binary_accuracy: 0.7854 - 269ms/epoch - 9ms/step
Epoch 8/10
30/30 - 0s - loss: 0.2873 - auc_11: 0.9514 - binary_accuracy: 0.8619 - val_loss: 0.5063 - val_auc_11: 0.8884 - 

Epoch 9/10
30/30 - 0s - loss: 0.2860 - auc_15: 0.9523 - binary_accuracy: 0.8469 - val_loss: 0.3835 - val_auc_15: 0.9139 - val_binary_accuracy: 0.8197 - 408ms/epoch - 14ms/step
Epoch 10/10
30/30 - 0s - loss: 0.2753 - auc_15: 0.9549 - binary_accuracy: 0.8576 - val_loss: 0.3694 - val_auc_15: 0.9144 - val_binary_accuracy: 0.8197 - 358ms/epoch - 12ms/step

Split no 3

Training
Epoch 1/10
30/30 - 1s - loss: 0.7693 - auc_16: 0.7359 - binary_accuracy: 0.4925 - val_loss: 0.7148 - val_auc_16: 0.8019 - val_binary_accuracy: 0.7009 - 1s/epoch - 37ms/step
Epoch 2/10
30/30 - 0s - loss: 0.5138 - auc_16: 0.9227 - binary_accuracy: 0.6921 - val_loss: 0.6747 - val_auc_16: 0.8408 - val_binary_accuracy: 0.5342 - 309ms/epoch - 10ms/step
Epoch 3/10
30/30 - 0s - loss: 0.4553 - auc_16: 0.9304 - binary_accuracy: 0.7833 - val_loss: 0.6144 - val_auc_16: 0.8584 - val_binary_accuracy: 0.7073 - 320ms/epoch - 11ms/step
Epoch 4/10
30/30 - 0s - loss: 0.4089 - auc_16: 0.9387 - binary_accuracy: 0.8294 - val_loss: 0.6183 -

#### Best parameters

In [45]:
print("Tuned Depth for Shallow NN = ",d)
print("Tuned Widhth for Shallow NN = ",w)

Tuned Depth for Shallow NN =  1
Tuned Widhth for Shallow NN =  40


In [46]:
model = keras.Sequential( [keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(1)] )

model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2)

Epoch 1/20
44/44 - 1s - loss: 0.6737 - auc_20: 0.7742 - binary_accuracy: 0.5179 - val_loss: 0.6027 - val_auc_20: 0.8494 - val_binary_accuracy: 0.6059 - 1s/epoch - 23ms/step
Epoch 2/20
44/44 - 0s - loss: 0.4977 - auc_20: 0.9239 - binary_accuracy: 0.7157 - val_loss: 0.5292 - val_auc_20: 0.8884 - val_binary_accuracy: 0.7818 - 388ms/epoch - 9ms/step
Epoch 3/20
44/44 - 0s - loss: 0.4302 - auc_20: 0.9417 - binary_accuracy: 0.8343 - val_loss: 0.4846 - val_auc_20: 0.9033 - val_binary_accuracy: 0.7948 - 362ms/epoch - 8ms/step
Epoch 4/20
44/44 - 0s - loss: 0.3821 - auc_20: 0.9437 - binary_accuracy: 0.8364 - val_loss: 0.4313 - val_auc_20: 0.9113 - val_binary_accuracy: 0.8274 - 365ms/epoch - 8ms/step
Epoch 5/20
44/44 - 0s - loss: 0.3448 - auc_20: 0.9489 - binary_accuracy: 0.8479 - val_loss: 0.3993 - val_auc_20: 0.9149 - val_binary_accuracy: 0.8339 - 369ms/epoch - 8ms/step
Epoch 6/20
44/44 - 0s - loss: 0.3194 - auc_20: 0.9499 - binary_accuracy: 0.8500 - val_loss: 0.3764 - val_auc_20: 0.9223 - val_b

<keras.callbacks.History at 0x1ed542dbca0>

#### Testing on Test set

In [47]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test)
print("test accuracy = ", results[2]);

Evaluate on test data
test accuracy =  0.8532423377037048


#### Deep NN

In [48]:
import datetime
model = keras.Sequential( [keras.layers.Dense(30,activation='relu'),keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(30,activation='relu'),keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(1)] )
model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 10,validation_data=(x_validate,y_validate),verbose = 2)

Epoch 1/10
44/44 - 1s - loss: 0.6683 - auc_21: 0.7582 - binary_accuracy: 0.5179 - val_loss: 0.6549 - val_auc_21: 0.8201 - val_binary_accuracy: 0.5244 - 1s/epoch - 23ms/step
Epoch 2/10
44/44 - 0s - loss: 0.6326 - auc_21: 0.8711 - binary_accuracy: 0.5257 - val_loss: 0.6276 - val_auc_21: 0.8599 - val_binary_accuracy: 0.5244 - 343ms/epoch - 8ms/step
Epoch 3/10
44/44 - 0s - loss: 0.5916 - auc_21: 0.9007 - binary_accuracy: 0.5264 - val_loss: 0.5822 - val_auc_21: 0.8919 - val_binary_accuracy: 0.5179 - 346ms/epoch - 8ms/step
Epoch 4/10
44/44 - 0s - loss: 0.5303 - auc_21: 0.9232 - binary_accuracy: 0.6071 - val_loss: 0.5085 - val_auc_21: 0.9118 - val_binary_accuracy: 0.7427 - 346ms/epoch - 8ms/step
Epoch 5/10
44/44 - 0s - loss: 0.4444 - auc_21: 0.9402 - binary_accuracy: 0.7929 - val_loss: 0.4285 - val_auc_21: 0.9265 - val_binary_accuracy: 0.7948 - 338ms/epoch - 8ms/step
Epoch 6/10
44/44 - 0s - loss: 0.3628 - auc_21: 0.9483 - binary_accuracy: 0.8443 - val_loss: 0.3505 - val_auc_21: 0.9366 - val_b

<keras.callbacks.History at 0x1ed0f13c400>

#### Shuffling

In [49]:
import tensorflow as tf
from tensorflow import keras 
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

#### Hyper parameters

In [50]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([4,5,6]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

#### CrossValidation

In [51]:
def CV_Deep_Binary_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(1))
        model.compile(optimizer = 'sgd',
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_binary_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_binary_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_binary_accuracy"])
    
    return accuracy/k, d, w, current_best

#### Tuning

In [52]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_Deep_Binary_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

--- Starting trial: run-WIDTH20-DEPTH4
{'NN_width': 20, 'NN_depth': 4}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 0.6996 - auc_22: 0.5756 - binary_accuracy: 0.5000 - val_loss: 0.6939 - val_auc_22: 0.7352 - val_binary_accuracy: 0.4657 - 1s/epoch - 35ms/step
Epoch 2/10
30/30 - 0s - loss: 0.6783 - auc_22: 0.7761 - binary_accuracy: 0.5214 - val_loss: 0.6814 - val_auc_22: 0.7619 - val_binary_accuracy: 0.4764 - 259ms/epoch - 9ms/step
Epoch 3/10
30/30 - 0s - loss: 0.6604 - auc_22: 0.8210 - binary_accuracy: 0.5343 - val_loss: 0.6745 - val_auc_22: 0.8063 - val_binary_accuracy: 0.4785 - 252ms/epoch - 8ms/step
Epoch 4/10
30/30 - 0s - loss: 0.6508 - auc_22: 0.8708 - binary_accuracy: 0.5353 - val_loss: 0.6659 - val_auc_22: 0.8702 - val_binary_accuracy: 0.4807 - 252ms/epoch - 8ms/step
Epoch 5/10
30/30 - 0s - loss: 0.6392 - auc_22: 0.8994 - binary_accuracy: 0.5353 - val_loss: 0.6625 - val_auc_22: 0.8592 - val_binary_accuracy: 0.4850 - 256ms/epoch - 9ms/step
Epoch 6/10
30/30 - 0s - loss: 0.62

Epoch 7/10
30/30 - 0s - loss: 0.5875 - auc_26: 0.8904 - binary_accuracy: 0.5193 - val_loss: 0.5992 - val_auc_26: 0.8945 - val_binary_accuracy: 0.5322 - 257ms/epoch - 9ms/step
Epoch 8/10
30/30 - 0s - loss: 0.5450 - auc_26: 0.9168 - binary_accuracy: 0.6017 - val_loss: 0.6081 - val_auc_26: 0.8389 - val_binary_accuracy: 0.7554 - 277ms/epoch - 9ms/step
Epoch 9/10
30/30 - 0s - loss: 0.4909 - auc_26: 0.9253 - binary_accuracy: 0.7784 - val_loss: 0.4911 - val_auc_26: 0.9007 - val_binary_accuracy: 0.7961 - 258ms/epoch - 9ms/step
Epoch 10/10
30/30 - 0s - loss: 0.4266 - auc_26: 0.9318 - binary_accuracy: 0.8158 - val_loss: 0.4343 - val_auc_26: 0.9133 - val_binary_accuracy: 0.7747 - 242ms/epoch - 8ms/step

Split no 3

Training
Epoch 1/10
30/30 - 2s - loss: 0.7083 - auc_27: 0.4955 - binary_accuracy: 0.4775 - val_loss: 0.6976 - val_auc_27: 0.6954 - val_binary_accuracy: 0.4893 - 2s/epoch - 62ms/step
Epoch 2/10
30/30 - 0s - loss: 0.6900 - auc_27: 0.7704 - binary_accuracy: 0.4936 - val_loss: 0.6903 - val

Epoch 3/10
30/30 - 0s - loss: 0.6440 - auc_31: 0.8863 - binary_accuracy: 0.5385 - val_loss: 0.6527 - val_auc_31: 0.8681 - val_binary_accuracy: 0.5000 - 275ms/epoch - 9ms/step
Epoch 4/10
30/30 - 0s - loss: 0.6313 - auc_31: 0.9224 - binary_accuracy: 0.5385 - val_loss: 0.6381 - val_auc_31: 0.8728 - val_binary_accuracy: 0.4957 - 280ms/epoch - 9ms/step
Epoch 5/10
30/30 - 0s - loss: 0.6168 - auc_31: 0.9345 - binary_accuracy: 0.5418 - val_loss: 0.6275 - val_auc_31: 0.8823 - val_binary_accuracy: 0.5000 - 292ms/epoch - 10ms/step
Epoch 6/10
30/30 - 0s - loss: 0.5990 - auc_31: 0.9431 - binary_accuracy: 0.5749 - val_loss: 0.6019 - val_auc_31: 0.8954 - val_binary_accuracy: 0.5343 - 305ms/epoch - 10ms/step
Epoch 7/10
30/30 - 0s - loss: 0.5768 - auc_31: 0.9463 - binary_accuracy: 0.6788 - val_loss: 0.5875 - val_auc_31: 0.9071 - val_binary_accuracy: 0.7618 - 340ms/epoch - 11ms/step
Epoch 8/10
30/30 - 0s - loss: 0.5529 - auc_31: 0.9448 - binary_accuracy: 0.8255 - val_loss: 0.5593 - val_auc_31: 0.9101 - 

Epoch 9/10
30/30 - 0s - loss: 0.5610 - auc_35: 0.9286 - binary_accuracy: 0.5535 - val_loss: 0.5997 - val_auc_35: 0.9019 - val_binary_accuracy: 0.5300 - 374ms/epoch - 12ms/step
Epoch 10/10
30/30 - 0s - loss: 0.5178 - auc_35: 0.9362 - binary_accuracy: 0.7441 - val_loss: 0.5583 - val_auc_35: 0.8902 - val_binary_accuracy: 0.7597 - 340ms/epoch - 11ms/step

Split no 3

Training
Epoch 1/10
30/30 - 2s - loss: 0.6885 - auc_36: 0.7058 - binary_accuracy: 0.4914 - val_loss: 0.6820 - val_auc_36: 0.7739 - val_binary_accuracy: 0.5064 - 2s/epoch - 83ms/step
Epoch 2/10
30/30 - 0s - loss: 0.6500 - auc_36: 0.8712 - binary_accuracy: 0.5215 - val_loss: 0.6756 - val_auc_36: 0.7864 - val_binary_accuracy: 0.5064 - 291ms/epoch - 10ms/step
Epoch 3/10
30/30 - 0s - loss: 0.6339 - auc_36: 0.8717 - binary_accuracy: 0.5215 - val_loss: 0.6631 - val_auc_36: 0.8544 - val_binary_accuracy: 0.5064 - 310ms/epoch - 10ms/step
Epoch 4/10
30/30 - 0s - loss: 0.6162 - auc_36: 0.8907 - binary_accuracy: 0.5215 - val_loss: 0.6424 -

Epoch 5/10
30/30 - 0s - loss: 0.5384 - auc_40: 0.9230 - binary_accuracy: 0.6756 - val_loss: 0.5717 - val_auc_40: 0.8779 - val_binary_accuracy: 0.5536 - 304ms/epoch - 10ms/step
Epoch 6/10
30/30 - 0s - loss: 0.4811 - auc_40: 0.9473 - binary_accuracy: 0.7752 - val_loss: 0.5179 - val_auc_40: 0.8907 - val_binary_accuracy: 0.7618 - 358ms/epoch - 12ms/step
Epoch 7/10
30/30 - 0s - loss: 0.4168 - auc_40: 0.9541 - binary_accuracy: 0.8469 - val_loss: 0.5603 - val_auc_40: 0.8849 - val_binary_accuracy: 0.5773 - 336ms/epoch - 11ms/step
Epoch 8/10
30/30 - 0s - loss: 0.3660 - auc_40: 0.9488 - binary_accuracy: 0.8544 - val_loss: 0.6293 - val_auc_40: 0.8834 - val_binary_accuracy: 0.7961 - 328ms/epoch - 11ms/step
Epoch 9/10
30/30 - 0s - loss: 0.3204 - auc_40: 0.9527 - binary_accuracy: 0.8662 - val_loss: 0.5161 - val_auc_40: 0.8939 - val_binary_accuracy: 0.8047 - 330ms/epoch - 11ms/step
Epoch 10/10
30/30 - 0s - loss: 0.2950 - auc_40: 0.9497 - binary_accuracy: 0.8630 - val_loss: 0.4489 - val_auc_40: 0.8934


Split no 3

Training
Epoch 1/10
30/30 - 1s - loss: 0.7207 - auc_45: 0.6442 - binary_accuracy: 0.4764 - val_loss: 0.6937 - val_auc_45: 0.6937 - val_binary_accuracy: 0.4936 - 1s/epoch - 41ms/step
Epoch 2/10
30/30 - 0s - loss: 0.6742 - auc_45: 0.8241 - binary_accuracy: 0.5064 - val_loss: 0.6803 - val_auc_45: 0.7067 - val_binary_accuracy: 0.5021 - 317ms/epoch - 11ms/step
Epoch 3/10
30/30 - 0s - loss: 0.6478 - auc_45: 0.8357 - binary_accuracy: 0.5204 - val_loss: 0.6677 - val_auc_45: 0.8259 - val_binary_accuracy: 0.5064 - 379ms/epoch - 13ms/step
Epoch 4/10
30/30 - 0s - loss: 0.6319 - auc_45: 0.8699 - binary_accuracy: 0.5204 - val_loss: 0.6560 - val_auc_45: 0.8578 - val_binary_accuracy: 0.5064 - 346ms/epoch - 12ms/step
Epoch 5/10
30/30 - 0s - loss: 0.6131 - auc_45: 0.8871 - binary_accuracy: 0.5215 - val_loss: 0.6482 - val_auc_45: 0.8671 - val_binary_accuracy: 0.5085 - 350ms/epoch - 12ms/step
Epoch 6/10
30/30 - 0s - loss: 0.5881 - auc_45: 0.8966 - binary_accuracy: 0.5215 - val_loss: 0.6388 - 

#### Tuned Parameters

In [53]:
print("Tuned Depth for Deep NN = ",d)
print("Tuned Widhth for Deep NN = ",w)

Tuned Depth for Deep NN =  4
Tuned Widhth for Deep NN =  40


In [54]:
model = keras.Sequential( [keras.layers.Dense(40,activation='relu'),keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(40,activation='relu'),keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(1)] )

model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2)

Epoch 1/20
44/44 - 1s - loss: 0.6793 - auc_49: 0.7408 - binary_accuracy: 0.5071 - val_loss: 0.6584 - val_auc_49: 0.8465 - val_binary_accuracy: 0.5179 - 1s/epoch - 23ms/step
Epoch 2/20
44/44 - 0s - loss: 0.6300 - auc_49: 0.8748 - binary_accuracy: 0.5257 - val_loss: 0.6265 - val_auc_49: 0.8750 - val_binary_accuracy: 0.5212 - 435ms/epoch - 10ms/step
Epoch 3/20
44/44 - 0s - loss: 0.5882 - auc_49: 0.9162 - binary_accuracy: 0.5357 - val_loss: 0.5797 - val_auc_49: 0.9094 - val_binary_accuracy: 0.6906 - 378ms/epoch - 9ms/step
Epoch 4/20
44/44 - 0s - loss: 0.5289 - auc_49: 0.9348 - binary_accuracy: 0.7257 - val_loss: 0.5215 - val_auc_49: 0.9213 - val_binary_accuracy: 0.6938 - 366ms/epoch - 8ms/step
Epoch 5/20
44/44 - 0s - loss: 0.4508 - auc_49: 0.9416 - binary_accuracy: 0.8314 - val_loss: 0.4345 - val_auc_49: 0.9360 - val_binary_accuracy: 0.8306 - 373ms/epoch - 8ms/step
Epoch 6/20
44/44 - 0s - loss: 0.3717 - auc_49: 0.9467 - binary_accuracy: 0.8400 - val_loss: 0.3628 - val_auc_49: 0.9390 - val_

<keras.callbacks.History at 0x1ec1a73a8b0>

### Testing on test set

In [55]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test)
print("test accuracy = ", results[2]);

Evaluate on test data
test accuracy =  0.8464163541793823
