In [1]:
#importing libraries

import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer,OneHotEncoder, VectorAssembler

from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator



In [2]:
#creating spark session
findspark.init()
findspark.find()

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("MQTT") \
    .getOrCreate()

spark = SparkSession.builder.appName("mqttProject").getOrCreate()
sc    = spark.sparkContext

sqlContext = SQLContext(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/11/29 01:45:59 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/11/29 01:45:59 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/11/29 01:45:59 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/11/29 01:45:59 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [3]:
test = spark.read.csv( ("gs://dataproc-staging-us-west3-650974721448-eojcphee/test30_augmented.csv"),header=True, inferSchema= True)
train = spark.read.csv( ("gs://dataproc-staging-us-west3-650974721448-eojcphee/train70_augmented.csv"),header=True, inferSchema= True)
DF = train.union(test)
DF = DF.toDF(*(c.replace('.', '_') for c in DF.columns))
train = train.toDF(*(c.replace('.', '_') for c in train.columns))
test = test.toDF(*(c.replace('.', '_') for c in test.columns))
train = train.drop("mqtt_hdrflags", "tcp_flags", 'mqtt_conack_flags','mqtt_conflags','mqtt_msg',
        'mqtt_protoname')
test = test.drop("mqtt_hdrflags", "tcp_flags", 'mqtt_conack_flags','mqtt_conflags','mqtt_msg',
        'mqtt_protoname')

                                                                                

In [4]:
train = train.limit(1400)
test =  test.limit(600)

In [5]:
numeric_features = [feature[0] for feature in DF.dtypes if feature[1] not in ('string')]
string_features = [feature[0] for feature in DF.dtypes if feature[1] in ('string')]
to_drop =  ["mqtt_conflag_cleansess","mqtt_proto_len","mqtt_conflag_passwd","mqtt_qos"]

In [6]:
col_names = ['tcp_time_delta','tcp_len','mqtt_conack_flags','mqtt_conack_flags_reserved','mqtt_conack_flags_sp',
 'mqtt_conack_val','mqtt_conflag_cleansess','mqtt_conflag_passwd','mqtt_conflag_qos','mqtt_conflag_reserved',
 'mqtt_conflag_retain','mqtt_conflag_uname','mqtt_conflag_willflag','mqtt_conflags','mqtt_dupflag', 
 'mqtt_kalive', 'mqtt_len','mqtt_msg','mqtt_msgid', 'mqtt_msgtype', 'mqtt_proto_len', 'mqtt_qos', 'mqtt_retain',
 'mqtt_sub_qos', 'mqtt_suback_qos', 'mqtt_ver', 'mqtt_willmsg', 'mqtt_willmsg_len', 'mqtt_willtopic', 'mqtt_willtopic_len',
 'target']

# nominal_cols = ['mqtt_conack_flags','mqtt_conflags', 'mqtt_msg', 'mqtt_protoname']
nominal_cols = []

continuous_cols = ['tcp_time_delta', 'tcp_len', 'mqtt_conack_flags_reserved', 'mqtt_conack_flags_sp', 'mqtt_conack_val',
 'mqtt_conflag_cleansess', 'mqtt_conflag_passwd', 'mqtt_conflag_qos', 'mqtt_conflag_reserved', 'mqtt_conflag_retain',
 'mqtt_conflag_uname', 'mqtt_conflag_willflag', 'mqtt_dupflag', 'mqtt_kalive', 'mqtt_len', 'mqtt_msgid',
 'mqtt_msgtype', 'mqtt_proto_len', 'mqtt_qos', 'mqtt_retain', 'mqtt_sub_qos', 'mqtt_suback_qos', 'mqtt_ver',
 'mqtt_willmsg', 'mqtt_willmsg_len', 'mqtt_willtopic', 'mqtt_willtopic_len']

In [7]:
class OutcomeCreater_binary(Transformer): # this defines a transformer that creates the outcome column
    
    def __init__(self):
        super().__init__()
  
    def _transform(self, dataset):
        label_to_binary = udf(lambda name: 0.0 if name == 'legitimate' else 1.0)
        output_df = dataset.withColumn('outcome', label_to_binary(col('target'))).drop("target")  
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df
    
class OutcomeCreater_multi(Transformer): # this defines a transformer that creates the outcome column
    
    def __init__(self):
        super().__init__()
  
    def _transform(self, dataset):
        label_to_multiple = udf(lambda name: 0.0 if name == 'legitimate' else (1.0 if name == "flood" else(2.0 if name == "dos" else(3.0 if name == "bruteforce" else(4.0 if name == "slowite" else (5.0))))))
        output_df = dataset.withColumn('outcome', label_to_multiple(col('target'))).drop("target")  
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df
        
class FeatureTypeCaster(Transformer): # this transformer will cast the columns as appropriate types  
    def __init__(self):
        super().__init__()

    def _transform(self, dataset):
        output_df = dataset
        for col_name in continuous_cols:
            output_df = output_df.withColumn(col_name,col(col_name).cast(DoubleType()))

        return output_df
    
class ColumnDropper(Transformer): # this transformer drops unnecessary columns
    def __init__(self, columns_to_drop = None):
        super().__init__()
        self.columns_to_drop=columns_to_drop
    def _transform(self, dataset):
        output_df = dataset
        for col_name in self.columns_to_drop:
            output_df = output_df.drop(col_name)
            
        return output_df
    
def get_preprocess_pipeline(classification):
    # Stage where columns are casted as appropriate types
    stage_typecaster = FeatureTypeCaster()

    # Stage where nominal columns are transformed to index columns using StringIndexer
    nominal_id_cols = [x+"_index" for x in nominal_cols]
    nominal_onehot_cols = [x+"_encoded" for x in nominal_cols]
    stage_nominal_indexer = StringIndexer(inputCols = nominal_cols, outputCols = nominal_id_cols )
    
    # Stage where the index columns are further transformed using OneHotEncoder
    stage_nominal_onehot_encoder = OneHotEncoder(inputCols=nominal_id_cols, outputCols=nominal_onehot_cols)

    # Stage where all relevant features are assembled into a vector (and dropping a few)
    feature_cols = continuous_cols+nominal_onehot_cols
    corelated_cols_to_remove = to_drop
    
    for col_name in corelated_cols_to_remove:
        feature_cols.remove(col_name)
    stage_vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="vectorized_features")

    # Stage where we scale the columns
    stage_scaler = StandardScaler(inputCol= 'vectorized_features', outputCol= 'features')
    

    # Stage for creating the outcome column representing whether there is attack
    if(classification == "binary"): 
        stage_outcome = OutcomeCreater_binary()
    else:
        stage_outcome = OutcomeCreater_multi()

    # Removing all unnecessary columbs, only keeping the 'features' and 'outcome' columns
    stage_column_dropper = ColumnDropper(columns_to_drop = nominal_cols+nominal_id_cols+
        nominal_onehot_cols + continuous_cols + ['vectorized_features'])
    
    pipeline = Pipeline(stages=[stage_typecaster,stage_nominal_indexer,stage_nominal_onehot_encoder,
        stage_vector_assembler,stage_scaler,stage_outcome,stage_column_dropper])
    
    return pipeline 

### Multiclass classification

In [8]:
preprocess_pipeline = get_preprocess_pipeline("multi")
preprocess_pipeline_model = preprocess_pipeline.fit(train)

train_df = preprocess_pipeline_model.transform(train)
test_df = preprocess_pipeline_model.transform(test)

22/11/29 01:47:10 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [9]:
# train_df= train_df.limit(1000)
# test_df = test_df.limit(400)

In [10]:
train_df.show(15)



+--------------------+-------+
|            features|outcome|
+--------------------+-------+
|(23,[0,1,12,14],[...|    0.0|
|(23,[0],[4.436988...|    3.0|
|(23,[0],[1.774795...|    1.0|
|(23,[0],[1.774795...|    4.0|
|(23,[0,1,12,14],[...|    1.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0],[2.440343...|    3.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0,1,8,11,12,...|    3.0|
|(23,[1,12,13,14],...|    2.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0],[2.795302...|    5.0|
|(23,[0],[4.436988...|    2.0|
+--------------------+-------+
only showing top 15 rows



                                                                                

In [11]:
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))

df_train = train_df
df_validate,df_test = test_df.randomSplit([0.5,0.5])

In [12]:
df_train.show(15)

+--------------------+-------+
|            features|outcome|
+--------------------+-------+
|(23,[0,1,12,14],[...|    0.0|
|(23,[0],[4.436988...|    3.0|
|(23,[0],[1.774795...|    1.0|
|(23,[0],[1.774795...|    4.0|
|(23,[0,1,12,14],[...|    1.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0],[2.440343...|    3.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0,1,8,11,12,...|    3.0|
|(23,[1,12,13,14],...|    2.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0,1,12,14],[...|    0.0|
|(23,[0],[2.795302...|    5.0|
|(23,[0],[4.436988...|    2.0|
+--------------------+-------+
only showing top 15 rows



In [13]:
df_train_pandas = df_train.withColumn('features', to_array('features')).toPandas()



In [14]:
df_validate_pandas = df_validate.withColumn('features', to_array('features')).toPandas()
df_test_pandas = df_test.withColumn('features', to_array('features')).toPandas()



In [15]:
# pip install tensorflow

In [16]:
import tensorflow as tf
from tensorflow import keras 

# Converting the pandas DataFrame to tensors
# Note we are using 3 data sets train, validate, test

x_train = tf.constant(np.array(df_train_pandas['features'].values.tolist()))
y_train = tf.constant(np.array(df_train_pandas['outcome'].values.tolist()))

x_validate = tf.constant(np.array(df_validate_pandas['features'].values.tolist()))
y_validate = tf.constant(np.array(df_validate_pandas['outcome'].values.tolist()))


x_test = tf.constant(np.array(df_test_pandas['features'].values.tolist()))
y_test = tf.constant(np.array(df_test_pandas['outcome'].values.tolist()))

2022-11-29 01:47:19.151499: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-29 01:47:19.335629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-29 01:47:19.335670: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-29 01:47:20.158781: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

### Shallow NN

In [17]:
import datetime
model_multiclass = keras.Sequential( [keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.6856 - sparse_categorical_accuracy: 0.4757 - val_loss: 1.5728 - val_sparse_categorical_accuracy: 0.4320 - 822ms/epoch - 19ms/step
Epoch 2/20
44/44 - 0s - loss: 1.4877 - sparse_categorical_accuracy: 0.4986 - val_loss: 1.3901 - val_sparse_categorical_accuracy: 0.4932 - 122ms/epoch - 3ms/step
Epoch 3/20
44/44 - 0s - loss: 1.3551 - sparse_categorical_accuracy: 0.5521 - val_loss: 1.2726 - val_sparse_categorical_accuracy: 0.5816 - 131ms/epoch - 3ms/step
Epoch 4/20
44/44 - 0s - loss: 1.2656 - sparse_categorical_accuracy: 0.5850 - val_loss: 1.1938 - val_sparse_categorical_accuracy: 0.6020 - 117ms/epoch - 3ms/step
Epoch 5/20
44/44 - 0s - loss: 1.2036 - sparse_categorical_accuracy: 0.5943 - val_loss: 1.1403 - val_sparse_categorical_accuracy: 0.6054 - 117ms/epoch - 3ms/step
Epoch 6/20
44/44 - 0s - loss: 1.1596 - sparse_categorical_accuracy: 0.5950 - val_loss: 1.1032 - val_sparse_categorical_accuracy: 0.6054 - 129ms/epoch - 3ms/step
Epoch 7/20
44/44 - 0s - loss: 1.1

<keras.callbacks.History at 0x7f1c143d6730>

#### Hyper parameter tuning with cross validation

##### Shuffling

In [18]:
import tensorflow as tf
from tensorflow import keras 
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

##### Hyperparameters

In [19]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([1,2]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

##### CrossValidation

In [20]:
def CV_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(6))
        model.compile(
          optimizer=keras.optimizers.SGD(),
          loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          metrics=[keras.metrics.SparseCategoricalAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_sparse_categorical_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_sparse_categorical_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_sparse_categorical_accuracy"])
    
    return accuracy/k, d, w, current_best

In [21]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

--- Starting trial: run-WIDTH20-DEPTH1
{'NN_width': 20, 'NN_depth': 1}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 1.9153 - sparse_categorical_accuracy: 0.1895 - val_loss: 1.8579 - val_sparse_categorical_accuracy: 0.1695 - 582ms/epoch - 19ms/step
Epoch 2/10
30/30 - 0s - loss: 1.7572 - sparse_categorical_accuracy: 0.4411 - val_loss: 1.7239 - val_sparse_categorical_accuracy: 0.4742 - 82ms/epoch - 3ms/step
Epoch 3/10
30/30 - 0s - loss: 1.6286 - sparse_categorical_accuracy: 0.5161 - val_loss: 1.6168 - val_sparse_categorical_accuracy: 0.4914 - 79ms/epoch - 3ms/step
Epoch 4/10
30/30 - 0s - loss: 1.5262 - sparse_categorical_accuracy: 0.5257 - val_loss: 1.5317 - val_sparse_categorical_accuracy: 0.5043 - 87ms/epoch - 3ms/step
Epoch 5/10
30/30 - 0s - loss: 1.4452 - sparse_categorical_accuracy: 0.5321 - val_loss: 1.4606 - val_sparse_categorical_accuracy: 0.5150 - 81ms/epoch - 3ms/step
Epoch 6/10
30/30 - 0s - loss: 1.3779 - sparse_categorical_accuracy: 0.5396 - val_loss: 1.4026 - val_spars

##### Best parameters

In [22]:
print("Tuned Depth for shallow NN = ",d)
print("Tuned Widhth for shallow NN = ",w)

Tuned Depth for shallow NN =  1
Tuned Widhth for shallow NN =  30


#### Tuned Hyper parameters

In [23]:
model_multiclass = keras.Sequential( [keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.8622 - sparse_categorical_accuracy: 0.2586 - val_loss: 1.6694 - val_sparse_categorical_accuracy: 0.5510 - 592ms/epoch - 13ms/step
Epoch 2/20
44/44 - 0s - loss: 1.5765 - sparse_categorical_accuracy: 0.5721 - val_loss: 1.4691 - val_sparse_categorical_accuracy: 0.5578 - 115ms/epoch - 3ms/step
Epoch 3/20
44/44 - 0s - loss: 1.4127 - sparse_categorical_accuracy: 0.5729 - val_loss: 1.3529 - val_sparse_categorical_accuracy: 0.5816 - 116ms/epoch - 3ms/step
Epoch 4/20
44/44 - 0s - loss: 1.3149 - sparse_categorical_accuracy: 0.5764 - val_loss: 1.2729 - val_sparse_categorical_accuracy: 0.5816 - 120ms/epoch - 3ms/step
Epoch 5/20
44/44 - 0s - loss: 1.2478 - sparse_categorical_accuracy: 0.5764 - val_loss: 1.2118 - val_sparse_categorical_accuracy: 0.5918 - 121ms/epoch - 3ms/step
Epoch 6/20
44/44 - 0s - loss: 1.1987 - sparse_categorical_accuracy: 0.5886 - val_loss: 1.1649 - val_sparse_categorical_accuracy: 0.5918 - 121ms/epoch - 3ms/step
Epoch 7/20
44/44 - 0s - loss: 1.1

<keras.callbacks.History at 0x7f1bfd488040>

#### Evaluate on Test set

In [24]:
print("Evaluate on test data")
results = model_multiclass.evaluate(x_test, y_test)
print("test accuracy = ", results[1]);

Evaluate on test data
test accuracy =  0.6339869499206543


### Deep NN

In [25]:
import datetime
model_multiclass = keras.Sequential( [keras.layers.Dense(30,activation='relu'),keras.layers.Dense(30,activation='relu'),
                                      keras.layers.Dense(20,activation='relu'),keras.layers.Dense(20,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.7333 - sparse_categorical_accuracy: 0.4200 - val_loss: 1.6739 - val_sparse_categorical_accuracy: 0.4490 - 865ms/epoch - 20ms/step
Epoch 2/20
44/44 - 0s - loss: 1.5744 - sparse_categorical_accuracy: 0.4871 - val_loss: 1.5574 - val_sparse_categorical_accuracy: 0.4490 - 161ms/epoch - 4ms/step
Epoch 3/20
44/44 - 0s - loss: 1.4708 - sparse_categorical_accuracy: 0.4871 - val_loss: 1.4633 - val_sparse_categorical_accuracy: 0.4490 - 158ms/epoch - 4ms/step
Epoch 4/20
44/44 - 0s - loss: 1.3934 - sparse_categorical_accuracy: 0.4971 - val_loss: 1.3851 - val_sparse_categorical_accuracy: 0.4762 - 167ms/epoch - 4ms/step
Epoch 5/20
44/44 - 0s - loss: 1.3298 - sparse_categorical_accuracy: 0.5186 - val_loss: 1.3115 - val_sparse_categorical_accuracy: 0.5102 - 155ms/epoch - 4ms/step
Epoch 6/20
44/44 - 0s - loss: 1.2727 - sparse_categorical_accuracy: 0.5357 - val_loss: 1.2433 - val_sparse_categorical_accuracy: 0.5442 - 161ms/epoch - 4ms/step
Epoch 7/20
44/44 - 0s - loss: 1.2

<keras.callbacks.History at 0x7f1bfd3a24f0>

##### Cross validation function

In [26]:
def CV_Deep_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(6))
        model.compile(
          optimizer=keras.optimizers.SGD(),
          loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          metrics=[keras.metrics.SparseCategoricalAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_sparse_categorical_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_sparse_categorical_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_sparse_categorical_accuracy"])
    
    return accuracy/k, d, w, current_best

#### Shuffling

In [27]:
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

##### Hyperparameters

In [28]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([4,5,6]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

In [29]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_Deep_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

--- Starting trial: run-WIDTH20-DEPTH4
{'NN_width': 20, 'NN_depth': 4}

Split no 1

Training
Epoch 1/10
30/30 - 1s - loss: 1.8199 - sparse_categorical_accuracy: 0.3019 - val_loss: 1.7505 - val_sparse_categorical_accuracy: 0.4635 - 1s/epoch - 38ms/step
Epoch 2/10
30/30 - 0s - loss: 1.6743 - sparse_categorical_accuracy: 0.5032 - val_loss: 1.6557 - val_sparse_categorical_accuracy: 0.4227 - 86ms/epoch - 3ms/step
Epoch 3/10
30/30 - 0s - loss: 1.5750 - sparse_categorical_accuracy: 0.5064 - val_loss: 1.5886 - val_sparse_categorical_accuracy: 0.4227 - 86ms/epoch - 3ms/step
Epoch 4/10
30/30 - 0s - loss: 1.4981 - sparse_categorical_accuracy: 0.5075 - val_loss: 1.5343 - val_sparse_categorical_accuracy: 0.4292 - 88ms/epoch - 3ms/step
Epoch 5/10
30/30 - 0s - loss: 1.4324 - sparse_categorical_accuracy: 0.5193 - val_loss: 1.4866 - val_sparse_categorical_accuracy: 0.4421 - 94ms/epoch - 3ms/step
Epoch 6/10
30/30 - 0s - loss: 1.3765 - sparse_categorical_accuracy: 0.5268 - val_loss: 1.4485 - val_sparse_c

### Best Parameters

In [30]:
print("Tuned Depth for Deep NN = ",d)
print("Tuned Widhth for Deep NN = ",w)

Tuned Depth for Deep NN =  4
Tuned Widhth for Deep NN =  40


In [31]:
model_multiclass = keras.Sequential( [keras.layers.Dense(40,activation='relu'),keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(40,activation='relu'), keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(6)] )

model_multiclass.compile(optimizer = 'sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()])

log_dir = "logss/multiiclassfin/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_multiclass.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/20
44/44 - 1s - loss: 1.7127 - sparse_categorical_accuracy: 0.4179 - val_loss: 1.6267 - val_sparse_categorical_accuracy: 0.5646 - 840ms/epoch - 19ms/step
Epoch 2/20
44/44 - 0s - loss: 1.5698 - sparse_categorical_accuracy: 0.5671 - val_loss: 1.5112 - val_sparse_categorical_accuracy: 0.5442 - 173ms/epoch - 4ms/step
Epoch 3/20
44/44 - 0s - loss: 1.4564 - sparse_categorical_accuracy: 0.5586 - val_loss: 1.4047 - val_sparse_categorical_accuracy: 0.5442 - 158ms/epoch - 4ms/step
Epoch 4/20
44/44 - 0s - loss: 1.3556 - sparse_categorical_accuracy: 0.5764 - val_loss: 1.3061 - val_sparse_categorical_accuracy: 0.5748 - 168ms/epoch - 4ms/step
Epoch 5/20
44/44 - 0s - loss: 1.2702 - sparse_categorical_accuracy: 0.5929 - val_loss: 1.2257 - val_sparse_categorical_accuracy: 0.5986 - 158ms/epoch - 4ms/step
Epoch 6/20
44/44 - 0s - loss: 1.2073 - sparse_categorical_accuracy: 0.6050 - val_loss: 1.1693 - val_sparse_categorical_accuracy: 0.6088 - 160ms/epoch - 4ms/step
Epoch 7/20
44/44 - 0s - loss: 1.1

<keras.callbacks.History at 0x7f1bfd814b50>

### Testing on Test set

In [32]:
print("Evaluate on test data")
results = model_multiclass.evaluate(x_test, y_test)
print("test accuracy = ", results[1]);

Evaluate on test data
test accuracy =  0.6405228972434998


## Binary Classification

In [33]:
test = spark.read.csv( ("gs://dataproc-staging-us-west3-650974721448-vbkdchoj/test30_augmented.csv"),header=True, inferSchema= True)
train = spark.read.csv( ("gs://dataproc-staging-us-west3-650974721448-vbkdchoj/train70_augmented.csv"),header=True, inferSchema= True)
DF = train.union(test)
DF = DF.toDF(*(c.replace('.', '_') for c in DF.columns))
train = train.toDF(*(c.replace('.', '_') for c in train.columns))
test = test.toDF(*(c.replace('.', '_') for c in test.columns))
train = train.drop("mqtt_hdrflags", "tcp_flags", 'mqtt_conack_flags','mqtt_conflags','mqtt_msg',
        'mqtt_protoname')
test = test.drop("mqtt_hdrflags", "tcp_flags", 'mqtt_conack_flags','mqtt_conflags','mqtt_msg',
        'mqtt_protoname')
train = train.limit(1400)
test = test.limit(600)

AnalysisException: Path does not exist: gs://dataproc-staging-us-west3-650974721448-vbkdchoj/test30_augmented.csv

In [None]:
preprocess_pipeline = get_preprocess_pipeline("binary")
preprocess_pipeline_model = preprocess_pipeline.fit(train)

train_df = preprocess_pipeline_model.transform(train)
test_df = preprocess_pipeline_model.transform(test)

In [None]:
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))

df_train = train_df
df_validate,df_test = test_df.randomSplit([0.5,0.5])

In [None]:
df_train_pandas = df_train.withColumn('features', to_array('features')).toPandas()
df_validate_pandas = df_validate.withColumn('features', to_array('features')).toPandas()
df_test_pandas = df_test.withColumn('features', to_array('features')).toPandas()

In [None]:
import tensorflow as tf
from tensorflow import keras 

# Converting the pandas DataFrame to tensors
# Note we are using 3 data sets train, validate, test

x_train = tf.constant(np.array(df_train_pandas['features'].values.tolist()))
y_train = tf.constant(np.array(df_train_pandas['outcome'].values.tolist()))

x_validate = tf.constant(np.array(df_validate_pandas['features'].values.tolist()))
y_validate = tf.constant(np.array(df_validate_pandas['outcome'].values.tolist()))


x_test = tf.constant(np.array(df_test_pandas['features'].values.tolist()))
y_test = tf.constant(np.array(df_test_pandas['outcome'].values.tolist()))

#### Shallow NN

In [None]:
import datetime
model = keras.Sequential( [keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(1)] )
model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 10,validation_data=(x_validate,y_validate),verbose = 2)

#### Shuffling

In [None]:
import tensorflow as tf
from tensorflow import keras 
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

#### Hyperparameters

In [None]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([1,2]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

#### Cross Validation

In [None]:
def CV_Binary_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(1))
        model.compile(optimizer = 'sgd',
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_binary_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_binary_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_binary_accuracy"])
    
    return accuracy/k, d, w, current_best

#### Tuning

In [None]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_Binary_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

#### Best parameters

In [None]:
print("Tuned Depth for Shallow NN = ",d)
print("Tuned Widhth for Shallow NN = ",w)

In [None]:
model = keras.Sequential( [keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(1)] )

model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2)

#### Testing on Test set

In [None]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test)
print("test accuracy = ", results[2]);

#### Deep NN

In [None]:
import datetime
model = keras.Sequential( [keras.layers.Dense(30,activation='relu'),keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(30,activation='relu'),keras.layers.Dense(30,activation='relu'),
                           keras.layers.Dense(1)] )
model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 10,validation_data=(x_validate,y_validate),verbose = 2)

#### Shuffling

In [None]:
import tensorflow as tf
from tensorflow import keras 
train = tf.concat([x_train,tf.reshape(y_train,[-1,1])],1)
train_shuffle = tf.random.shuffle(train)
x_train_shuffle = train_shuffle[:,0:tf.shape(x_train)[1]]
y_train_shuffle = train_shuffle[:,tf.shape(x_train)[1]]

#### Hyper parameters

In [None]:
from tensorboard.plugins.hparams import api as hp

HP_WIDTH = hp.HParam('NN_width', hp.Discrete([20,30,40]))
HP_DEPTH = hp.HParam('NN_depth', hp.Discrete([4,5,6]))


with tf.summary.create_file_writer('logs1483/hparams_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_WIDTH, HP_DEPTH],
    metrics=[hp.Metric('Accuracy')],
  )

#### CrossValidation

In [None]:
def CV_Deep_Binary_model(hparams,logdir, k, current_best, d, w):
    indiceslist = []
    
    for i in range(k-1):
        indices = tf.range(i * ((tf.shape(x_train_shuffle)[0])//k), (i + 1)* ((tf.shape(x_train_shuffle)[0])//k),1).numpy().tolist()
        indiceslist.append([indices])
        
    accuracy = 0
    # combining whatever remaining after k-1 splits (to account if length of dataset is not divisible by k)
    
    indices = tf.range((k-1) * ((tf.shape(x_train_shuffle)[0])//k), (tf.shape(x_train_shuffle)[0]),1).numpy().tolist()
    
    indiceslist.append([indices]) ## indiceslist to divide train and validate
    
    for i in range(k):
        print("\nSplit no",i+1)
        model = keras.Sequential()
        for _ in range(hparams[HP_DEPTH]):
            model.add(keras.layers.Dense(hparams[HP_WIDTH],activation='relu'))
        model.add(keras.layers.Dense(1))
        model.compile(optimizer = 'sgd',
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])
        
        b = indiceslist[i][0] 
        a = [int(item) for item in b]

        x_validate = tf.gather(x_train_shuffle,a)

        y_validate = tf.gather(y_train_shuffle,a)
        
        z = []
        
        for j in range(k):
            
            if j != i:                     ## Make sure validation and train set are different
                z =  z + indiceslist[j][0]
                
        a = [int(item) for item in z]
        
        x_train = tf.gather(x_train_shuffle, a)
        y_train = tf.gather(y_train_shuffle, a)
        
        print("\nTraining")
        
        history = model.fit(x_train, y_train, epochs= 10,validation_data = (x_validate,y_validate), verbose = 2)
        if np.max(history.history["val_binary_accuracy"]) > current_best:
                  current_best = np.max(history.history["val_binary_accuracy"])
                  d = hparams[HP_DEPTH]
                  w = hparams[HP_WIDTH]
        accuracy = accuracy + np.max(history.history["val_binary_accuracy"])
    
    return accuracy/k, d, w, current_best

#### Tuning

In [None]:
k =3 
current_best = 0
d,w = 0,0
for hp_width in HP_WIDTH.domain.values:
  for hp_depth in (HP_DEPTH.domain.values):
    hparams = {
        HP_WIDTH: hp_width,
        HP_DEPTH: hp_depth,
    }
    run_name = f"run-WIDTH{int(hparams[HP_WIDTH])}-DEPTH{hparams[HP_DEPTH]}"
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    run_dir = 'logs1483/hparams_tuning/' + run_name
    accuracy, d, w, current_best = CV_Deep_Binary_model(hparams,run_dir, k, current_best, d, w)

    with tf.summary.create_file_writer(run_dir).as_default():
      hp.hparams(hparams)  # record the values used in this trial
      tf.summary.scalar("Accuracy", accuracy, step=1)

#### Tuned Parameters

In [None]:
print("Tuned Depth for Deep NN = ",d)
print("Tuned Widhth for Deep NN = ",w)

In [None]:
model = keras.Sequential( [keras.layers.Dense(40,activation='relu'),keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(40,activation='relu'),keras.layers.Dense(40,activation='relu'),
                           keras.layers.Dense(1)] )

model.compile(optimizer = 'sgd',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.AUC(from_logits=True),keras.metrics.BinaryAccuracy()])

model.fit(x_train,y_train, epochs = 20,validation_data=(x_validate,y_validate),verbose = 2)

### Testing on test set

In [None]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test)
print("test accuracy = ", results[2]);