In [None]:
%tensorflow_version 2.x
import math
import time

import matplotlib.pyplot as plt
# import seaborn

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

import random

import sys

import keras
import numpy as np
#np.random.seed(0) # make random consistent

# Read data

In [None]:
import os
import re

import numpy as np
import pandas
from tqdm.notebook import tqdm

import signal


class TimeoutException(Exception):
    """Exception type raised by ``timeout_handler`` when its signal is delivered."""


def timeout_handler(signum, frame):
    """Signal handler that turns a delivered signal into a ``TimeoutException``.

    :param signum: signal number supplied by the ``signal`` module (unused)
    :param frame: interrupted stack frame supplied by the ``signal`` module (unused)
    :raises TimeoutException: always
    """
    raise TimeoutException()


# Change the behavior of SIGALRM
signal.signal(signal.SIGALRM, timeout_handler)


class DataImporter:
    """
    Loads a raw log file and extracts the fields named in a log template.

    The template is a string made of ``<Name>`` placeholders separated by
    literal text, e.g. ``'<Token0> <Token1> <Message>'``.  ``load`` and
    ``load_special`` read the ``Message`` and ``Token0`` columns, so the
    template must define both placeholders.
    """

    def __init__(self, log_template, dataset_folder_path, dataset_name, dataset_step=1,
                 dataset_limit=100000, dataset_type='main', normal_indicator:str='-', aux_count=50000):
        """
        :param log_template: template containing <Token{n}> and <Message> placeholders
        :param dataset_folder_path: folder that contains the raw dataset file
        :param dataset_name: file name of the raw dataset inside that folder
        :param dataset_step: only lines whose 0-based index is a multiple of this are parsed
        :param dataset_limit: reading stops after the line with this 0-based index
        :param dataset_type: 'main' or 'auxiliary'; selects what ``load`` returns
        :param normal_indicator: value of Token0 that is mapped to label 0
        :param aux_count: number of rows sampled per class for 'auxiliary' datasets
        """
        self.log_template = log_template  # a template containing <Token{n}> and <Message>
        self.log_dataframe = None
        self.dataset_folder_path: str = dataset_folder_path  # path to the dataset folder
        self.dataset_name: str = dataset_name  # full name of raw dataset
        self.step: int = dataset_step  # step taken to sample auxiliary dataset
        self.log_template_regex: re.Pattern = re.compile(r'')
        self.log_template_headers: list[str] = []
        self.limit: int = dataset_limit  # used for faster experiment only
        self.dataset_type: str = dataset_type
        self.normal_indicator: str = normal_indicator  # Token0 value that yields label 0
        self.aux_count: int = aux_count

    def log_loader(self):
        """
        Read the dataset file and parse every ``step``-th line with the template regex.

        Lines that do not match the template are skipped.  Call
        ``log_template_matcher`` first so the regex and headers are populated.

        :return: DataFrame with a 1-based ``LineId`` column followed by one
                 column per template header
        """
        parsed_rows = []
        dataset_path = os.path.join(self.dataset_folder_path, self.dataset_name)
        # there's uncommon encoding in dataset BG/P
        with open(dataset_path, 'r', encoding="latin-1") as ds:
            for line_no, line in enumerate(tqdm(ds)):
                if line_no % self.step == 0:  # jump over steps
                    try:
                        match = self.log_template_regex.search(line.strip())
                    except TimeoutException:
                        # Only reachable if a SIGALRM watchdog is armed around the
                        # search (the alarm is not armed here); treat as "no match".
                        match = None
                    if match is not None:
                        parsed_rows.append(
                            [match.group(header) for header in self.log_template_headers])
                # Checked for every line (also skipped ones) so the limit always applies.
                if line_no == self.limit:
                    break
        df = pandas.DataFrame(parsed_rows, columns=self.log_template_headers)
        df.insert(0, 'LineId', np.arange(1, len(parsed_rows) + 1))
        return df

    def _load(self, verbose):
        """
        Shared implementation of ``load`` and ``load_special``.

        :param verbose: when True, print the normal-class shape and the anomalous
                        messages before sampling an 'auxiliary' dataset
        :return: ``(normal_values, anomaly_values)`` for 'auxiliary' datasets,
                 ``(messages, labels)`` for 'main' datasets
        :raises ValueError: if ``dataset_type`` is neither 'main' nor 'auxiliary'
        """
        self.log_template_matcher()

        self.log_dataframe = self.log_loader()

        # Label 0 where Token0 equals the normal indicator, 1 everywhere else.
        log_messages = self.log_dataframe.Message
        true_labels = np.where(self.log_dataframe.Token0.values == self.normal_indicator, 0, 1)

        if self.dataset_type == 'auxiliary':
            normal_messages = log_messages.iloc[true_labels == 0]
            anomalous_messages = log_messages.iloc[true_labels == 1]
            if verbose:
                print(normal_messages.shape)
                print(anomalous_messages)
            # sample() raises ValueError when a class has fewer than aux_count rows.
            df_normal = normal_messages.sample(n=self.aux_count).values
            df_anomalies = anomalous_messages.sample(n=self.aux_count).values
            return df_normal, df_anomalies
        if self.dataset_type == 'main':
            return log_messages, true_labels
        raise ValueError(
            f"Unknown dataset_type {self.dataset_type!r}; expected 'main' or 'auxiliary'")

    def load(self):
        """
        Parse the dataset and return it according to ``dataset_type``.

        Prints the normal-class shape and the anomalous messages for
        'auxiliary' datasets; see ``_load`` for the return values.
        """
        return self._load(verbose=True)

    def load_special(self):
        """Same as ``load`` but without the diagnostic prints."""
        return self._load(verbose=False)

    def log_template_matcher(self):
        """
        Compile ``log_template`` into a regex with one named group per placeholder.

        Runs of spaces in the literal parts become ``\\s+``; every ``<Name>``
        placeholder becomes a lazy named group ``(?P<Name>.+?)``.  Other characters
        of the literal parts are inserted into the pattern unescaped, so they are
        interpreted as regex syntax.  The result is stored in
        ``log_template_headers`` and ``log_template_regex``.
        """
        headers = []
        pattern_parts = []
        # re.split with a capturing group alternates literal text (even indexes)
        # and placeholders (odd indexes).
        for chunk_idx, chunk in enumerate(re.split(r'(<[^<>]+>)', self.log_template)):
            if chunk_idx % 2 == 0:
                pattern_parts.append(re.sub(' +', r'\\s+', chunk))
            else:
                header = chunk.strip('<').strip('>')
                pattern_parts.append('(?P<%s>.+?)' % header)  # change * from +
                headers.append(header)
        expression = ''.join(pattern_parts)
        print(expression)

        self.log_template_headers = headers
        self.log_template_regex = re.compile('^' + expression + '$')

    def pickle_processed(self, processed):
        """
        Pickle ``processed`` to ``<dataset_folder_path>/<dataset_name>_processed.pkl``.

        :param processed: object to serialise
        :return: None
        """
        import pickle
        pickle_path = os.path.join(self.dataset_folder_path, f'{self.dataset_name}_processed.pkl')
        # pickle.dump needs a writable binary stream.
        with open(pickle_path, 'wb') as cached:
            print(f"Dumping processed dataset to pickle file path - {pickle_path}")
            pickle.dump(processed, cached)


# Tokenize data 

Use NLTK and regular expressions to lower-case each message and remove HTTP endpoints, stop words, and non-alphabetic (e.g. numeric) tokens.

Finally, prepend the `[CLS]` token (note: this step is currently commented out in `DataTokenizer.tokenize`).

In [None]:
# Get NLTK data dicts
# `re`, `word_tokenize` and `stopwords` are used by the DataTokenizer class below.
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources at runtime (needs network access on a fresh machine).
# 'stopwords' is read through stopwords.words('english') in DataTokenizer.__init__.
nltk.download('stopwords')
# 'punkt' is presumably the tokenizer model word_tokenize relies on - confirm for
# the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
"""
Standard tokenizer + do what the paper says
"""
class DataTokenizer:
    """
    Turns a raw log message into a list of integer word ids.

    The vocabulary (``word2index``) starts with the three reserved entries
    ``[PAD]``, ``[CLS]`` and ``[MASK]`` and grows by one id every time
    ``tokenize`` meets a word it has not seen before.
    """

    def __init__(self):
        self.word2index = {'[PAD]': 0, '[CLS]': 1, '[MASK]': 2}
        self.num_words = 3
        self.stop_words = set(stopwords.words('english'))

    def tokenize(self, message):
        """
        Clean ``message`` and map each remaining word to its vocabulary id.

        Steps (paper section IV, tokenization processing): lower-case, drop
        everything from a '/' up to the last ':' and then from any remaining '/'
        to the end of the line, split with ``word_tokenize``, keep only purely
        alphabetic words that are not English stop words.

        :param message: raw log message string
        :return: list of integer ids; unseen words are added to ``word2index``
        """
        text = message.lower()
        # filter for endpoints: first "/...:" spans, then any trailing "/..." remainder
        for endpoint_pattern in (r'/.*:', r'/.*'):
            text = re.sub(endpoint_pattern, '', text, flags=re.MULTILINE)

        kept_words = [
            token for token in word_tokenize(text)
            if token.isalpha() and token not in self.stop_words
        ]

        # The '[CLS]' embedding token is intentionally not prepended here.
        encoded = []
        for token in kept_words:
            token_id = self.word2index.get(token)
            if token_id is None:
                token_id = self.num_words
                self.word2index[token] = token_id
                self.num_words += 1
            encoded.append(token_id)
        return encoded


In [None]:
# Mount Google Drive so the datasets stored there are visible to this runtime.
# This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# special to google colab
# Relative path to the folder holding the raw log files; every DataImporter below
# reads from it and the tokenized pickle is written to it.
folder_path = 'drive/MyDrive/logsy_data/dataset'

Mounted at /content/drive


In [None]:
# TODO move small dataset to same named but in dataset_small folder
## loading bgp https://www.usenix.org/sites/default/files/4372-intrepid_ras_0901_0908_scrubbed.zip.tar 1.0GB; this dataset uses anomaly indicator 'FATAL'
# BG/P line layout. Token0 is the field compared against `normal_indicator`;
# runs of spaces in the template are turned into \s+ by DataImporter.
bgp_template = '<Token1> <Token2>          <Token3>       <Token4>                  <Token5>    <Token0> <Message>'
name = 'Intrepid_RAS_0901_0908_scrubbed'  # BG/P

# use special loader
# With normal_indicator='FATAL', rows whose Token0 equals 'FATAL' receive label 0
# and every other row receives label 1 (see DataImporter.load_special).
special_d, special_l = DataImporter(log_template=bgp_template, dataset_folder_path=folder_path,
                                    dataset_name=name, dataset_step=1, dataset_type='main',dataset_limit=11000000, normal_indicator='FATAL').load_special()
# Label 0 therefore selects the FATAL rows, which are used as auxiliary anomalies.
third_anomaly = special_d[special_l==0]
print(f'\nSuccessfully imported - {len(special_d)}  => {len(third_anomaly)} Messages from dataset at {os.path.join(folder_path, name)}\n\n')

# Preview of entries 1-4 (the first entry is skipped by the slice).
print(third_anomaly[1:5])
# third_normal = third_data[label==1]
# third_anomaly = third_data[label==0]
## Use others as auxiliary 
### loading spirit http://0b4af6cdc2f0c5998459-c0245c5c937c5dedcca3f1764ecc9b2f.r43.cf2.rackcdn.com/hpc4/spirit2.gz
#### big one so step should be bigger, 39GB of data; this dataset uses anomaly indicator '-'
# Spirit line layout: Token0 is the label field, '-' marks a normal line.
spirit_template = '<Token0> <Token1> <Token2> <Token3> <Token4> <Token5> <Token6> <Token7> <Message>'
#name = 'spirit2'
name = 'spirit_small'
# Also reused by later DataImporter calls to derive their aux_count.
dataset_limit = 6000000
# dataset_step=3 parses every third line; aux_count rows are then sampled from each
# class (2% of dataset_limit), which fails if a class has fewer rows than that.
first_normal, first_anomaly = DataImporter(log_template=spirit_template, dataset_folder_path=folder_path,
                                    dataset_name=name, dataset_step=3, dataset_type='auxiliary',dataset_limit=dataset_limit, normal_indicator='-', aux_count=int(dataset_limit*0.02)).load()
print(f'\nSuccessfully imported - {len(first_normal)} => {len(first_anomaly)} Messages from dataset at {os.path.join(folder_path, name)}\n\n')
# Preview of entries 1-4 of the sampled anomalies.
print(first_anomaly[1:5])
### loading liberty http://0b4af6cdc2f0c5998459-c0245c5c937c5dedcca3f1764ecc9b2f.r43.cf2.rackcdn.com/hpc4/liberty2.gz
### 30GB not used yet
...

# thunderbird_template = '<Token0> <Token1> <Token2> <Token3> <Token4> <Token5> <Token6> <Token7> <Token8>(\[<Token9>\])?: <Message>'

# name = 'tbird2_small'  # original dataset is too big, limit to 5,000,000 rows
# second_normal, second_anomaly = DataImporter(log_template=thunderbird_template, dataset_folder_path=folder_path,
#                                     dataset_name=name, dataset_step=1, dataset_type='auxiliary',dataset_limit=dataset_limit, normal_indicator='-', aux_count=int(dataset_limit*0.02)).load()
# print(f'\nSuccessfully imported - {len(second_normal)} => {len(second_anomaly)} Messages from dataset at {os.path.join(folder_path, name)}\n\n')



### BG/L http://0b4af6cdc2f0c5998459-c0245c5c937c5dedcca3f1764ecc9b2f.r43.cf2.rackcdn.com/hpc4/bgl2.gz 0.72GB
bgl_template = '<Token0> <Token1> <Token2> <Token3> <Token4> <Token5> <Token6> <Token7> <Token8> <Message>'  # bgl style token template
name = 'bgl2'  # BG/L

# NOTE(review): aux_count is derived from the `dataset_limit` variable set in the
# Spirit section (6000000), not from the dataset_limit=5000000 passed here -
# confirm this is intended.
second_normal, second_anomaly = DataImporter(log_template=bgl_template, dataset_folder_path=folder_path,
                                    dataset_name=name, dataset_step=1, dataset_type='auxiliary', dataset_limit=5000000, normal_indicator='-', aux_count=int(dataset_limit*0.02)).load()
print(f'\nSuccessfully imported - {len(second_normal)} => {len(second_anomaly)} Messages from dataset at {os.path.join(folder_path, name)}\n\n')
# Preview of entries 1-4 of the sampled anomalies.
print(second_anomaly[1:5])



(?P<Token1>.+?)\s+(?P<Token2>.+?)\s+(?P<Token3>.+?)\s+(?P<Token4>.+?)\s+(?P<Token5>.+?)\s+(?P<Token0>.+?)\s+(?P<Message>.+?)


0it [00:00, ?it/s]


Successfully imported - 2834486  => 30064 Messages from dataset at drive/MyDrive/logsy_data/dataset/Intrepid_RAS_0901_0908_scrubbed


8425    2009-01-08-07.41.39.355367 -                  ...
8479    2009-01-08-15.42.37.563676 -                  ...
8499    2009-01-08-16.12.46.273384 -                  ...
8530    2009-01-08-17.45.27.669492 -                  ...
Name: Message, dtype: object
(?P<Token0>.+?)\s+(?P<Token1>.+?)\s+(?P<Token2>.+?)\s+(?P<Token3>.+?)\s+(?P<Token4>.+?)\s+(?P<Token5>.+?)\s+(?P<Token6>.+?)\s+(?P<Token7>.+?)\s+(?P<Message>.+?)


0it [00:00, ?it/s]

(1411563,)
2                   kernel: hda: drive not ready for command
3                   kernel: hda: drive not ready for command
4                 kernel: hda: status error: status=0x00 { }
6                   kernel: hda: drive not ready for command
7                 kernel: hda: status error: status=0x00 { }
                                 ...                        
1666582    pbs_mom: scan_for_exiting, system epilog faile...
1666606    pbs_mom: scan_for_exiting, system epilog faile...
1666628    pbs_mom: scan_for_exiting, system epilog faile...
1666649    pbs_mom: scan_for_exiting, system epilog faile...
1666654    pbs_mom: scan_for_exiting, system epilog faile...
Name: Message, Length: 255100, dtype: object

Successfully imported - 120000 => 120000 Messages from dataset at drive/MyDrive/logsy_data/dataset/spirit_small


['kernel: hda: status error: status=0x00 { }'
 'kernel: hda: status error: status=0x00 { }'
 'kernel: hda: drive not ready for command'
 'kernel: hda: status 

0it [00:00, ?it/s]

(4364795,)
4903       ddr: excessive soft failures, consider replaci...
14143      ddr: excessive soft failures, consider replaci...
14737      ciod: failed to read message prefix on control...
14738      ciod: failed to read message prefix on control...
14739      ciod: failed to read message prefix on control...
                                 ...                        
4713488        idoproxy communication failure: socket closed
4713489        idoproxy communication failure: socket closed
4713490        idoproxy communication failure: socket closed
4713491        idoproxy communication failure: socket closed
4713492        idoproxy communication failure: socket closed
Name: Message, Length: 348698, dtype: object

Successfully imported - 120000 => 120000 Messages from dataset at drive/MyDrive/logsy_data/dataset/bgl2


['ciod: Error reading message prefix on CioStream socket to 172.16.96.116:36722, Link has been severed'
 'data TLB error interrupt'
 'Lustre mount FAILED : bglio636 :

In [None]:
##################### < THIS PART WORKS PROPERLY
# concatenate the 3 auxiliary datasets
concat_normal = [] # not needed, auxiliary data are all treated as anomalies
print(third_anomaly)
print(len(first_anomaly), len(second_anomaly), len(third_anomaly))
# np.append without an axis flattens its inputs, so the result is one 1-D array
# holding the anomaly messages of all three auxiliary datasets.
concat_anomaly = np.append(first_anomaly, second_anomaly) 
concat_anomaly = np.append(concat_anomaly, third_anomaly)

print(len(concat_anomaly))
# sampling auxiliary data from concat # we have 300000 =int(dataset_limit*0.05)
# Sampling without replacement requires len(concat_anomaly) >= size.
# NOTE(review): no random seed is set, so the sample differs between runs, and the
# recorded output of this cell shows shape (10000,), i.e. it was produced with a
# different `size` than the 250000 used here.
aux_anomalies = np.random.choice(concat_anomaly, size=250000, replace=False)
print(aux_anomalies.shape)
###
# 12.5% is anomaly aux

8310       2009-01-08-02.54.41.805715 -                  ...
8425       2009-01-08-07.41.39.355367 -                  ...
8479       2009-01-08-15.42.37.563676 -                  ...
8499       2009-01-08-16.12.46.273384 -                  ...
8530       2009-01-08-17.45.27.669492 -                  ...
                                 ...                        
2829445    2009-08-20-18.59.36.664765   0   _DIAGS_R07-M1...
2829446    2009-08-20-18.59.37.086821   0   _DIAGS_R07-M1...
2830363    2009-08-20-19.13.24.764997   0   _DIAGS_R06-M1...
2830675    2009-08-20-19.13.57.839694   0   _DIAGS_R07-M1...
2831541    2009-08-20-19.38.43.492988   0   _DIAGS_R04-M0...
Name: Message, Length: 30064, dtype: object
120000 120000 30064
270064
(10000,)


In [None]:
############################ Loading main data and auxiliary data (for testing purposes use small version)
## use tbird2 as main - currently using BG/L

# ### BG/L http://0b4af6cdc2f0c5998459-c0245c5c937c5dedcca3f1764ecc9b2f.r43.cf2.rackcdn.com/hpc4/bgl2.gz 0.72GB
# bgl_template = '<Token0> <Token1> <Token2> <Token3> <Token4> <Token5> <Token6> <Token7> <Token8> <Message>'  # bgl style token template
# name = 'bgl' 

# log_messages, labels = DataImporter(log_template=bgl_template, dataset_folder_path=folder_path,
#                                     dataset_name=name, dataset_step=1, dataset_type='main', dataset_limit=5000000, normal_indicator='-', aux_count=int(dataset_limit*0.02)).load()
# print(f'\nSuccessfully imported - {len(second_normal)} => {len(second_anomaly)} Messages from dataset at {os.path.join(folder_path, name)}\n\n')


# Thunderbird line layout. The literal parts of a template are spliced into a
# regular expression by DataImporter.log_template_matcher, so "(\[<Token9>\])?"
# is an optional bracketed group. Written as a raw string because "\[" and "\]"
# are not valid Python string escapes (the string value is unchanged).
thunderbird_template = r'<Token0> <Token1> <Token2> <Token3> <Token4> <Token5> <Token6> <Token7> <Token8>(\[<Token9>\])?: <Message>'

name = 'tbird2_medium_200m_40step'  # original dataset is too big, limit to 5,000,000 rows
# dataset_type='main' returns (messages, labels) with label 0 where Token0 == '-'.
# aux_count is not used for 'main' datasets; it still reads the `dataset_limit`
# variable defined in the auxiliary-loading cell, which must have been run first.
log_messages, labels = DataImporter(log_template=thunderbird_template, dataset_folder_path=folder_path,
                                    dataset_name=name, dataset_step=1, dataset_type='main',dataset_limit=5000000, normal_indicator='-', aux_count=int(dataset_limit*0.02)).load()
print(f'\nSuccessfully imported - {len(log_messages)} => {len(labels)} Messages from dataset at {os.path.join(folder_path, name)}\n\n')


print(log_messages)
print(log_messages.shape)
print(labels.shape)
# Column vector so the labels can be stacked with the auxiliary labels along axis 0.
labels = labels.reshape(-1, 1) # reshape
print(labels.shape)
print(labels[0])


(?P<Token0>.+?)\s+(?P<Token1>.+?)\s+(?P<Token2>.+?)\s+(?P<Token3>.+?)\s+(?P<Token4>.+?)\s+(?P<Token5>.+?)\s+(?P<Token6>.+?)\s+(?P<Token7>.+?)\s+(?P<Token8>.+?)(\[(?P<Token9>.+?)\])?:\s+(?P<Message>.+?)


0it [00:00, ?it/s]


Successfully imported - 99801 => 99801 Messages from dataset at drive/MyDrive/logsy_data/dataset/tbird2_medium_200m_40step


0                     tftp: client does not accept options
1                     tftp: client does not accept options
3                     tftp: client does not accept options
4                  session opened for user root by (uid=0)
                               ...                        
99796                          Got trap from peer on fd 13
99797    Instrumentation Service EventID: 1052 Temperat...
99798    [ib_sm_discovery.c:1103]: Failed discover node...
99799                          Got trap from peer on fd 13
99800    [ib_sm_discovery.c:470]: Failed to GetNodeInfo...
Name: Message, Length: 99801, dtype: object
(99801,)
(99801,)
(99801, 1)
[0]


In [None]:
#append the anomalies to the full data
# Both operands are reshaped to column vectors so they stack row-wise; the
# auxiliary messages end up after the len(log_messages) main-dataset messages.
concat_messages = np.append(log_messages.values.reshape(-1,1), aux_anomalies.reshape(-1,1), axis=0)
print(labels.shape) 
print(np.ones(len(aux_anomalies)).shape)
# Every auxiliary message gets label 1; the result is flattened back to 1-D.
# `labels` must already be a column vector (reshaped in the loading cell).
concat_labels = np.append(labels,  np.ones(len(aux_anomalies)).reshape(-1,1), axis=0).flatten()
# Bare expression in the middle of a cell: evaluated but not displayed.
concat_messages.shape, concat_labels.shape
import collections
# Last expression of the cell: displays the class balance of the combined labels.
collections.Counter(concat_labels)

(99801, 1)
(10000,)


Counter({0.0: 95732, 1.0: 14069})

In [None]:
############################# Tokenize full data
from tqdm.notebook import trange
print(f'Starting to tokenize messages, pushing result to pickle(TODO)')
# One tokenizer for the whole corpus so word ids are shared by all messages.
tokenizer = DataTokenizer()
print("##################### Data Shape ##############")
print(concat_messages.shape, concat_labels.shape)
print("##################### Data Shape End ##############")
df_len = int(concat_messages.shape[0])
tokenized_rows = []
for i in trange(df_len):
    # concat_messages is a column vector, so [i][0] is the i-th message string.
    tokenized_rows.append(tokenizer.tokenize(concat_messages[i][0]))

# The token lists have different lengths. Building a NumPy array from such a
# ragged list implicitly is deprecated (see the warning in the recorded output)
# and raises ValueError on NumPy >= 1.24, so allocate a 1-D object array and
# fill it element by element. The result is the same 1-D array of Python lists.
data_tokenized = np.empty(len(tokenized_rows), dtype=object)
for row_idx, row in enumerate(tokenized_rows):
    data_tokenized[row_idx] = row
print(data_tokenized.shape)

import pickle
print(f"vocab size - {tokenizer.num_words}")
vocab_size = tokenizer.num_words
print(folder_path)
# Cache the tokenized corpus so later sessions can skip tokenization.
with open(f'{folder_path}/pickled_concat', 'wb') as message_file:
    pickle.dump(data_tokenized, message_file)

Starting to tokenize messages, pushing result to pickle(TODO)
##################### Data Shape ##############
(109801, 1) (109801,)
##################### Data Shape End ##############


  0%|          | 0/109801 [00:00<?, ?it/s]

(109801,)
vocab size - 1102
drive/MyDrive/logsy_data/dataset


  return array(a, dtype, copy=False, order=order, subok=True)


# Split data into a training set and a test set

variables: 

    data_tokenized: all data

    labels : all labels

In [None]:
# load from file directly without tokenization
# Reloads the array written by the tokenization cell. `pickle` is imported in that
# cell, and `vocab_size` is only defined there as well.
# NOTE: pickle.load executes arbitrary code from the file - only load files you
# created yourself.

with open(f'{folder_path}/pickled_concat','rb') as p:
  data_tokenized = pickle.load(p)

In [None]:
# Fraction of the main (target) dataset whose leading rows are used for training.
ratio = 0.5
# Sizes are computed on the main dataset only (the auxiliary anomalies are
# appended separately); int() truncates, so the two sizes may not add up to
# len(log_messages).
train_size = int(len(log_messages) * ratio)
test_size = int(len(log_messages) * (1-ratio))
print(train_size, test_size)
print(train_size/len(log_messages))


49900 49900
0.49999499003016


In [None]:
# def split_data(data, labels, train_size, test_size):
#     print(train_size, test_size)
#     x_train, = np.append(data[:train_size][labels[:train_size]==0], data[train_size:])
#     y_train = labels[:train_size][labels[:train_size]==0]
#     x_test =  data[train_size:][labels[train_size:]==1]
#     y_test = labels[train_size:][labels[train_size:]==1]
#     print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
#     return x_train, y_train, x_test, y_test

# #from sklearn.model_selection import train_test_split
# #x_train, y_train, x_test, y_test = train_test_split(pd, labels, test_size=0.2, random_state=42)

# x_train, y_train, x_test, y_test = split_data(pd, labels, train_size, test_size)
# len(x_train), len(x_val), len(y_train), len(y_val)
# NOTE(review): the calls below use `collections.Counter`, which relies on the
# `import collections` executed in an earlier cell; the `Counter` name imported
# here is only referenced by commented-out code.
from collections import Counter
# Bare expression in the middle of a cell: evaluated but not displayed.
collections.Counter(concat_labels)
#print(Counter(labels))  # target set
print(data_tokenized.shape) # tokenized concat shape

print(len(log_messages))
print(len(concat_labels))

# Rows [0, target_size) come from the main dataset; rows from target_size onwards
# are the appended auxiliary anomalies.
target_size = len(log_messages)
a = collections.Counter(concat_labels[target_size:])
print(a)
# Training set: label-0 rows among the first train_size main rows, plus all
# auxiliary rows.
x_train = np.append(data_tokenized[:train_size][concat_labels[:train_size]==0], data_tokenized[target_size:],axis=0)
y_train = np.append(concat_labels[:train_size][concat_labels[:train_size]==0].flatten(), concat_labels[target_size:].flatten(),axis=0)
# Test set: the remaining main rows with their original labels. `test_size` is
# not used here, so the test set has target_size - train_size rows.
x_test = data_tokenized[train_size:target_size]
y_test = concat_labels[train_size:target_size]
# print(x_train, y_train, x_test, y_test )
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

print(collections.Counter(y_train))
print(collections.Counter(y_test))



(109801,)
99801
109801
Counter({1.0: 10000})
(59740,) (59740,) (49901,) (49901,)
Counter({0.0: 49740, 1.0: 10000})
Counter({0.0: 45992, 1.0: 3909})


In [None]:
# `tf` is needed for the padding masks below; no earlier cell imports it, so
# without this import the cell fails with NameError on Restart & Run All.
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

# Pad with zeros (the '[PAD]' id) or cut every sequence to exactly 50 ids;
# both padding and truncation happen at the end of the sequence.
x_train = pad_sequences(x_train, maxlen=50, truncating="post", padding="post") 

x_test = pad_sequences(x_test, maxlen=50, truncating="post", padding="post") 
print(x_train.shape, x_test.shape)
print(x_train[0])
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

## padding masks
# True at padded positions (id 0), False at real tokens.
x_train_masks = tf.equal(x_train, 0)
x_test_masks = tf.equal(x_test, 0)
print(x_train_masks,x_test_masks)

(59740, 50) (49901, 50)
[3 4 5 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
(59740, 50) (59740,) (49901, 50) (49901,)
tf.Tensor(
[[False False False ...  True  True  True]
 [False False False ...  True  True  True]
 [False False False ...  True  True  True]
 ...
 [False False False ...  True  True  True]
 [False False False ...  True  True  True]
 [False False False ...  True  True  True]], shape=(59740, 50), dtype=bool) tf.Tensor(
[[False False False ...  True  True  True]
 [False False  True ...  True  True  True]
 [False False False ...  True  True  True]
 ...
 [False False False ...  True  True  True]
 [False False False ...  True  True  True]
 [False False False ...  True  True  True]], shape=(49901, 50), dtype=bool)


# Transformer models

## An open-source implementation of standard transformer with multi-head attention, for comparison

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras.callbacks import Callback
import os



@tf.keras.utils.register_keras_serializable()
class Embedding(tf.keras.layers.Layer):
    """Embedding lookup whose output is scaled by ``sqrt(model_dim)``.

    Args:
        vocab_size: number of rows of the embedding matrix.
        model_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, model_dim, **kwargs):
        self._vocab_size = vocab_size
        self._model_dim = model_dim
        super(Embedding, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the ``(vocab_size, model_dim)`` embedding matrix."""
        self.embeddings = self.add_weight(
            shape=(self._vocab_size, self._model_dim),
            initializer='glorot_uniform',
            name="embeddings")
        super(Embedding, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Gather one embedding row per id in ``inputs`` and scale the result."""
        if K.dtype(inputs) != 'int32':
            inputs = K.cast(inputs, 'int32')
        embeddings = K.gather(self.embeddings, inputs)
        embeddings *= self._model_dim ** 0.5 # Scale
        return embeddings

    def compute_output_shape(self, input_shape):
        """The input shape with a trailing ``model_dim`` axis appended."""
        return input_shape + (self._model_dim,)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        The class is registered as Keras-serializable and has required
        constructor arguments, so they must be part of the config.
        """
        config = super(Embedding, self).get_config()
        config.update({
            'vocab_size': self._vocab_size,
            'model_dim': self._model_dim,
        })
        return config


@tf.keras.utils.register_keras_serializable()
class ScaledDotProductAttention(tf.keras.layers.Layer):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d)) V``.

    Args:
        masking: when True, ``call`` expects a fourth input holding the masks.
        future: when True, positions above the diagonal of the score matrix are masked.
        dropout_rate: dropout rate applied to the attention weights during training.
    """

    def __init__(self, masking=True, future=False, dropout_rate=0., **kwargs):
        self._masking = masking
        self._future = future
        self._dropout_rate = dropout_rate
        # Large negative value added to masked scores so softmax sends them to ~0.
        self._masking_num = -2**32+1
        super(ScaledDotProductAttention, self).__init__(**kwargs)
        # A Dropout layer (instead of a bare K.dropout call) so that dropout is
        # only active in training mode.
        self._attention_dropout = tf.keras.layers.Dropout(dropout_rate)

    def mask(self, inputs, masks):
        """Add ``_masking_num`` to the scores at positions where ``masks`` is set.

        ``masks`` is tiled along the first axis until it matches the first axis
        of ``inputs`` and then broadcast over axis 1.
        """
        masks = K.cast(masks, 'float32')
        masks = K.tile(masks, [K.shape(inputs)[0] // K.shape(masks)[0], 1])
        masks = K.expand_dims(masks, 1)
        outputs = inputs + masks * self._masking_num
        return outputs
    
    def future_mask(self, inputs):
        """Replace scores above the diagonal with ``_masking_num``."""
        diag_vals = tf.ones_like(inputs[0, :, :])
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  
        future_masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])
        paddings = tf.ones_like(future_masks) * self._masking_num
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
        return outputs

    def call(self, inputs, training=None, **kwargs):
        """Compute attention for ``[queries, keys, values]`` (plus ``masks`` if masking).

        Args:
            inputs: list of 3 tensors, or 4 when the layer was built with ``masking=True``.
            training: Keras training flag; dropout is skipped when it is falsy.
        """
        if self._masking:
            assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
            queries, keys, values, masks = inputs
        else:
            assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
            queries, keys, values = inputs

        if K.dtype(queries) != 'float32':  queries = K.cast(queries, 'float32')
        if K.dtype(keys) != 'float32':  keys = K.cast(keys, 'float32')
        if K.dtype(values) != 'float32':  values = K.cast(values, 'float32')

        matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1])) # MatMul
        scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5  # Scale
        if self._masking:
            scaled_matmul = self.mask(scaled_matmul, masks) # Mask(opt.)

        if self._future:
            scaled_matmul = self.future_mask(scaled_matmul)

        softmax_out = K.softmax(scaled_matmul) # SoftMax
        # Dropout (training only; identity at inference time)
        out = self._attention_dropout(softmax_out, training=training)
        
        outputs = K.batch_dot(out, values)

        return outputs

    def compute_output_shape(self, input_shape):
        """Shape of the queries with the last axis taken from the values."""
        queries_shape, values_shape = input_shape[0], input_shape[2]
        return tuple(queries_shape[:-1]) + (values_shape[-1],)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(ScaledDotProductAttention, self).get_config()
        config.update({
            'masking': self._masking,
            'future': self._future,
            'dropout_rate': self._dropout_rate,
        })
        return config


@tf.keras.utils.register_keras_serializable()
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Queries, keys and values are each projected to ``n_heads * head_dim``
    features, split into heads that are stacked along the batch axis, attended,
    and concatenated back along the feature axis.

    Args:
        n_heads: number of attention heads.
        head_dim: feature width of each head.
        dropout_rate: dropout rate forwarded to the attention layer.
        masking: when True, ``call`` expects a fourth input holding the masks.
        future: forwarded to the attention layer (masks future positions).
        trainable: whether the projection weights are trainable.
    """

    def __init__(self, n_heads, head_dim, dropout_rate=.1, masking=True, future=False, trainable=True, **kwargs):
        self._n_heads = n_heads
        self._head_dim = head_dim
        self._dropout_rate = dropout_rate
        self._masking = masking
        self._future = future
        # Forward `trainable` to Keras instead of storing it in `self._trainable`:
        # Layer.__init__ assigns that attribute itself, so a value stored before
        # the super() call was overwritten and `trainable=False` was ignored.
        super(MultiHeadAttention, self).__init__(trainable=trainable, **kwargs)

    def build(self, input_shape):
        """Create the query/key/value projection matrices.

        Trainability is governed by the layer-level ``trainable`` flag.
        """
        self._weights_queries = self.add_weight(
            shape=(input_shape[0][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_queries')
        self._weights_keys = self.add_weight(
            shape=(input_shape[1][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_keys')
        self._weights_values = self.add_weight(
            shape=(input_shape[2][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_values')
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Apply multi-head attention to ``[queries, keys, values]`` (plus ``masks``)."""
        if self._masking:
            assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
            queries, keys, values, masks = inputs
        else:
            assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
            queries, keys, values = inputs
        
        queries_linear = K.dot(queries, self._weights_queries) 
        keys_linear = K.dot(keys, self._weights_keys)
        values_linear = K.dot(values, self._weights_values)

        # Split the feature axis into heads and stack the heads along the batch axis.
        queries_multi_heads = tf.concat(tf.split(queries_linear, self._n_heads, axis=2), axis=0)
        keys_multi_heads = tf.concat(tf.split(keys_linear, self._n_heads, axis=2), axis=0)
        values_multi_heads = tf.concat(tf.split(values_linear, self._n_heads, axis=2), axis=0)
        
        if self._masking:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads, masks]
        else:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads]
            
        attention = ScaledDotProductAttention(
            masking=self._masking, future=self._future, dropout_rate=self._dropout_rate)
        att_out = attention(att_inputs)

        # Undo the head stacking: back from the batch axis to the feature axis.
        outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2)
        
        return outputs

    def compute_output_shape(self, input_shape):
        """Shape of the queries with the last axis replaced by ``n_heads * head_dim``."""
        return tuple(input_shape[0][:-1]) + (self._n_heads * self._head_dim,)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        ``trainable`` is already part of the base layer config.
        """
        config = super(MultiHeadAttention, self).get_config()
        config.update({
            'n_heads': self._n_heads,
            'head_dim': self._head_dim,
            'dropout_rate': self._dropout_rate,
            'masking': self._masking,
            'future': self._future,
        })
        return config

@tf.keras.utils.register_keras_serializable()
class PositionEncoding(Layer):
    """Sinusoidal position-encoding table.

    Args:
        model_dim: number of encoding features per position.
    """

    def __init__(self, model_dim, **kwargs):
        self._model_dim = model_dim
        super(PositionEncoding, self).__init__(**kwargs)

    def call(self, inputs, **kwargs):
        """Return a float32 table of shape ``(inputs.shape[1], model_dim)``.

        Only the static length of axis 1 of ``inputs`` is used, so that length
        must be known. The result has no batch axis.
        """
        seq_length = inputs.shape[1]
        positions = np.arange(seq_length, dtype=np.float64)[:, np.newaxis]
        feature_idx = np.arange(self._model_dim)
        # Features 2i and 2i+1 share the wavelength 10000^(2i / model_dim).
        wavelengths = np.power(10000, (feature_idx - feature_idx % 2) / self._model_dim)
        position_encodings = positions / wavelengths
        position_encodings[:, 0::2] = np.sin(position_encodings[:, 0::2]) # 2i
        position_encodings[:, 1::2] = np.cos(position_encodings[:, 1::2]) # 2i+1
        position_encodings = K.cast(position_encodings, 'float32')
        return position_encodings

    def compute_output_shape(self, input_shape):
        # NOTE(review): call() returns (seq_length, model_dim) without a batch
        # axis, which generally differs from input_shape - confirm what callers expect.
        return input_shape

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(PositionEncoding, self).get_config()
        config.update({'model_dim': self._model_dim})
        return config


@tf.keras.utils.register_keras_serializable()
class Add(Layer):
    """Adds exactly two tensors with the ``+`` operator."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs, **kwargs):
        """``inputs`` must unpack into two tensors; their sum is returned."""
        first, second = inputs
        return first + second

    def compute_output_shape(self, input_shape):
        """The output shape is reported as the shape of the first input."""
        first_shape = input_shape[0]
        return first_shape


@tf.keras.utils.register_keras_serializable()
class PositionWiseFeedForward(Layer):
    """Two-layer feed-forward block: ``relu(x W1 + b1) W2 + b2``.

    Args:
        model_dim: width of the output features.
        inner_dim: width of the hidden layer.
        trainable: whether the weights and biases are trainable.
    """
    
    def __init__(self, model_dim, inner_dim, trainable=True, **kwargs):
        self._model_dim = model_dim
        self._inner_dim = inner_dim
        # Forward `trainable` to Keras instead of storing it in `self._trainable`:
        # Layer.__init__ assigns that attribute itself, so a value stored before
        # the super() call was overwritten and `trainable=False` was ignored.
        super(PositionWiseFeedForward, self).__init__(trainable=trainable, **kwargs)

    def build(self, input_shape):
        """Create both weight matrices and bias vectors.

        Trainability is governed by the layer-level ``trainable`` flag.
        """
        self.weights_inner = self.add_weight(
            shape=(input_shape[-1], self._inner_dim),
            initializer='glorot_uniform',
            trainable=True,
            name="weights_inner")
        self.weights_out = self.add_weight(
            shape=(self._inner_dim, self._model_dim),
            initializer='glorot_uniform',
            trainable=True,
            name="weights_out")
        self.bias_inner = self.add_weight(
            shape=(self._inner_dim,),
            initializer='uniform',
            trainable=True,
            name="bias_inner")
        self.bias_out = self.add_weight(
            shape=(self._model_dim,),
            initializer='uniform',
            trainable=True,
            name="bias_out")
        super(PositionWiseFeedForward, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Apply the hidden ReLU layer followed by the linear output layer."""
        if K.dtype(inputs) != 'float32':
            inputs = K.cast(inputs, 'float32')
        inner_out = K.relu(K.dot(inputs, self.weights_inner) + self.bias_inner)
        outputs = K.dot(inner_out, self.weights_out) + self.bias_out
        return outputs

    def compute_output_shape(self, input_shape):
        """The input shape with its last axis replaced by ``model_dim``."""
        return tuple(input_shape[:-1]) + (self._model_dim,)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        ``trainable`` is already part of the base layer config.
        """
        config = super(PositionWiseFeedForward, self).get_config()
        config.update({
            'model_dim': self._model_dim,
            'inner_dim': self._inner_dim,
        })
        return config


@tf.keras.utils.register_keras_serializable()
class LayerNormalization(Layer):
    """Normalizes the last axis to zero mean / unit variance, then scales and shifts.

    Output is ``gamma * (x - mean) / sqrt(variance + epsilon) + beta`` with
    ``mean`` and ``variance`` computed over the last axis.

    Args:
        epsilon: constant added to the variance before the square root.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, epsilon=1e-8, **kwargs):
        self._epsilon = epsilon
        super(LayerNormalization, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the shift (``beta``, zeros) and scale (``gamma``, ones) vectors."""
        self.beta = self.add_weight(
            shape=(input_shape[-1],),
            initializer='zero',
            name='beta')
        self.gamma = self.add_weight(
            shape=(input_shape[-1],),
            initializer='one',
            name='gamma')
        super(LayerNormalization, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Normalize ``inputs`` over the last axis and apply gamma / beta."""
        mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
        normalized = (inputs - mean) / ((variance + self._epsilon) ** 0.5)
        outputs = self.gamma * normalized + self.beta
        return outputs

    def compute_output_shape(self, input_shape):
        """Normalization does not change the shape."""
        return input_shape

    def get_config(self):
        """Include ``epsilon`` so a reloaded layer does not fall back to the default."""
        config = super(LayerNormalization, self).get_config()
        config.update({"epsilon": self._epsilon})
        return config


@tf.keras.utils.register_keras_serializable()
class Transformer(Layer):
    """Encoder-decoder Transformer built from the custom layers in this file.

    A single embedding matrix of shape ``(vocab_size, model_dim)`` is used for
    the encoder input, the decoder input and (transposed) the final projection
    before the softmax.

    Args:
        vocab_size: number of rows in the embedding matrix / size of the
            softmax output.
        model_dim: embedding width.
        n_heads: number of attention heads; each head gets
            ``model_dim // n_heads`` units.
        encoder_stack: number of encoder blocks.
        decoder_stack: number of decoder blocks.
        feed_forward_size: inner width of the position-wise feed-forward layers.
        dropout_rate: dropout rate applied to the summed embeddings and
            position encodings.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self,
                 vocab_size,
                 model_dim,
                 n_heads=8,
                 encoder_stack=6,
                 decoder_stack=6,
                 feed_forward_size=2048,
                 dropout_rate=0.1,
                 **kwargs):

        self._vocab_size = vocab_size
        self._model_dim = model_dim
        self._n_heads = n_heads
        self._encoder_stack = encoder_stack
        self._decoder_stack = decoder_stack
        self._feed_forward_size = feed_forward_size
        self._dropout_rate = dropout_rate
        super(Transformer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the shared embedding matrix and every encoder/decoder sublayer."""
        self.embeddings = self.add_weight(
            shape=(self._vocab_size, self._model_dim),
            initializer='glorot_uniform',
            trainable=True,
            name="embeddings")

        head_dim = self._model_dim // self._n_heads

        # Encoder sublayers (one entry per encoder block).
        self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
        # Dropout layers honour the `training` flag; a bare backend dropout
        # call would also drop units at inference time.
        self.EncoderDropout = tf.keras.layers.Dropout(self._dropout_rate)
        self.EncoderMultiHeadAttentions = [
            MultiHeadAttention(self._n_heads, head_dim)
            for _ in range(self._encoder_stack)
        ]
        self.EncoderLayerNorms0 = [
            LayerNormalization()
            for _ in range(self._encoder_stack)
        ]
        self.EncoderPositionWiseFeedForwards = [
            PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
            for _ in range(self._encoder_stack)
        ]
        self.EncoderLayerNorms1 = [
            LayerNormalization()
            for _ in range(self._encoder_stack)
        ]

        # Decoder sublayers (one entry per decoder block).
        self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
        self.DecoderDropout = tf.keras.layers.Dropout(self._dropout_rate)
        # NOTE(review): `future=True` presumably enables look-ahead masking in
        # MultiHeadAttention - confirm against that class.
        self.DecoderMultiHeadAttentions0 = [
            MultiHeadAttention(self._n_heads, head_dim, future=True)
            for _ in range(self._decoder_stack)
        ]
        self.DecoderLayerNorms0 = [
            LayerNormalization()
            for _ in range(self._decoder_stack)
        ]
        self.DecoderMultiHeadAttentions1 = [
            MultiHeadAttention(self._n_heads, head_dim)
            for _ in range(self._decoder_stack)
        ]
        self.DecoderLayerNorms1 = [
            LayerNormalization()
            for _ in range(self._decoder_stack)
        ]
        self.DecoderPositionWiseFeedForwards = [
            PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
            for _ in range(self._decoder_stack)
        ]
        self.DecoderLayerNorms2 = [
            LayerNormalization()
            for _ in range(self._decoder_stack)
        ]
        super(Transformer, self).build(input_shape)

    def encoder(self, inputs, training=None):
        """Run the encoder stack on integer token ids.

        Args:
            inputs: token ids; cast to int32 if needed.
            training: forwarded to the dropout layer so that dropout is only
                active while training.

        Returns:
            ``(encodings, masks)`` where ``masks`` is True wherever the token
            id equals 0.
        """
        if K.dtype(inputs) != 'int32':
            inputs = K.cast(inputs, 'int32')

        masks = K.equal(inputs, 0)
        # Embeddings
        embeddings = K.gather(self.embeddings, inputs)
        embeddings *= self._model_dim ** 0.5  # Scale
        # Position Encodings
        position_encodings = self.EncoderPositionEncoding(embeddings)
        # Embeddings + Position-encodings
        encodings = embeddings + position_encodings
        # Dropout (training only)
        encodings = self.EncoderDropout(encodings, training=training)

        for i in range(self._encoder_stack):
            # Multi-head-Attention
            attention = self.EncoderMultiHeadAttentions[i]
            attention_input = [encodings, encodings, encodings, masks]
            attention_out = attention(attention_input)
            # Add & Norm
            attention_out += encodings
            attention_out = self.EncoderLayerNorms0[i](attention_out)
            # Feed-Forward
            ff = self.EncoderPositionWiseFeedForwards[i]
            ff_out = ff(attention_out)
            # Add & Norm
            ff_out += attention_out
            encodings = self.EncoderLayerNorms1[i](ff_out)

        return encodings, masks

    def decoder(self, inputs, training=None):
        """Run the decoder stack and project onto the vocabulary.

        Args:
            inputs: ``[decoder_inputs, encoder_encodings, encoder_masks]``.
            training: forwarded to the dropout layer so that dropout is only
                active while training.

        Returns:
            Softmax over the dot product of the final encodings with the
            transposed embedding matrix.
        """
        decoder_inputs, encoder_encodings, encoder_masks = inputs
        if K.dtype(decoder_inputs) != 'int32':
            decoder_inputs = K.cast(decoder_inputs, 'int32')

        decoder_masks = K.equal(decoder_inputs, 0)
        # Embeddings
        embeddings = K.gather(self.embeddings, decoder_inputs)
        embeddings *= self._model_dim ** 0.5  # Scale
        # Position Encodings
        position_encodings = self.DecoderPositionEncoding(embeddings)
        # Embeddings + Position-encodings
        encodings = embeddings + position_encodings
        # Dropout (training only)
        encodings = self.DecoderDropout(encodings, training=training)

        for i in range(self._decoder_stack):
            # Masked-Multi-head-Attention
            masked_attention = self.DecoderMultiHeadAttentions0[i]
            masked_attention_input = [encodings, encodings, encodings, decoder_masks]
            masked_attention_out = masked_attention(masked_attention_input)
            # Add & Norm
            masked_attention_out += encodings
            masked_attention_out = self.DecoderLayerNorms0[i](masked_attention_out)

            # Multi-head-Attention over the encoder output
            attention = self.DecoderMultiHeadAttentions1[i]
            attention_input = [masked_attention_out, encoder_encodings, encoder_encodings, encoder_masks]
            attention_out = attention(attention_input)
            # Add & Norm
            attention_out += masked_attention_out
            attention_out = self.DecoderLayerNorms1[i](attention_out)

            # Feed-Forward
            ff = self.DecoderPositionWiseFeedForwards[i]
            ff_out = ff(attention_out)
            # Add & Norm
            ff_out += attention_out
            encodings = self.DecoderLayerNorms2[i](ff_out)

        # Pre-SoftMax projection reuses the (transposed) embedding matrix.
        linear_projection = K.dot(encodings, K.transpose(self.embeddings))
        outputs = K.softmax(linear_projection)
        return outputs

    def call(self, encoder_inputs, decoder_inputs, training=None, **kwargs):
        """Encode ``encoder_inputs`` and decode ``decoder_inputs`` against the result.

        Args:
            encoder_inputs: token ids for the encoder.
            decoder_inputs: token ids for the decoder.
            training: Keras training flag; dropout is applied only when it is
                truthy (or when Keras resolves the learning phase to training).

        Returns:
            The decoder's softmax output.
        """
        encoder_encodings, encoder_masks = self.encoder(encoder_inputs, training=training)
        decoder_outputs = self.decoder(
            [decoder_inputs, encoder_encodings, encoder_masks], training=training)
        return decoder_outputs

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0][0], input_shape[0][1], vocab_size)``."""
        # NOTE(review): assumes `input_shape` is a sequence of per-input shapes
        # (encoder shape first) - confirm for the two-positional-argument call.
        return input_shape[0][0], input_shape[0][1], self._vocab_size

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        config = {
            "vocab_size": self._vocab_size,
            "model_dim": self._model_dim,
            "n_heads": self._n_heads,
            "encoder_stack": self._encoder_stack,
            "decoder_stack": self._decoder_stack,
            "feed_forward_size": self._feed_forward_size,
            "dropout_rate": self._dropout_rate
        }
        base_config = super(Transformer, self).get_config()
        return {**base_config, **config}


class Noam(Callback):
    """Learning-rate callback implementing a warm-up / inverse-square-root schedule.

    After every batch the optimizer learning rate is set to
    ``model_dim ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``.

    Args:
        model_dim: model width used to scale the learning rate.
        step_num: number of steps already taken (incremented after each batch).
        warmup_steps: step count entering the warm-up term of the schedule.
        verbose: when True, print the learning rate at the start of each epoch.
    """

    def __init__(self, model_dim, step_num=0, warmup_steps=4000, verbose=False):
        self._model_dim = model_dim
        self._step_num = step_num
        self._warmup_steps = warmup_steps
        self.verbose = verbose
        super(Noam, self).__init__()

    def _learning_rate(self, step):
        """Return the scheduled learning rate for ``step`` (``step`` >= 1)."""
        # Plain Python arithmetic: the operands are scalars, so there is no
        # need to create a backend tensor on every batch.
        return self._model_dim ** -.5 * min(step ** -.5, step * self._warmup_steps ** -1.5)

    def on_train_begin(self, logs=None):
        """Initialise the optimizer with the first warm-up learning rate."""
        init_lr = self._model_dim ** -.5 * self._warmup_steps ** -1.5
        K.set_value(self.model.optimizer.lr, init_lr)

    def on_batch_end(self, epoch, logs=None):
        """Advance the step counter and apply the scheduled learning rate.

        Keras passes the batch index as the first positional argument; it is
        not used here.
        """
        self._step_num += 1
        K.set_value(self.model.optimizer.lr, self._learning_rate(self._step_num))

    def on_epoch_begin(self, epoch, logs=None):
        """Optionally print the current learning rate."""
        if self.verbose:
            lrate = K.get_value(self.model.optimizer.lr)
            print(f"epoch {epoch} lr: {lrate}")

    def on_epoch_end(self, epoch, logs=None):
        """Record the current learning rate in ``logs`` under the key ``'lr'``."""
        logs = logs or {}
        logs['lr'] = K.get_value(self.model.optimizer.lr)
    

def label_smoothing(inputs, epsilon=0.1):
    """Blend ``inputs`` with a uniform distribution over the last axis.

    Each value is scaled by ``1 - epsilon`` and ``epsilon / K`` is added,
    where ``K`` is the size of the last axis of ``inputs``.
    """
    num_classes = inputs.shape[-1]
    keep_weight = 1 - epsilon
    uniform_share = epsilon / num_classes
    return keep_weight * inputs + uniform_share

# Build Transformer

In [None]:
from keras import backend as K

# def recall_m(y_true, y_pred):
#     y_pred = K.sum(K.square(y_pred), axis=1)
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#     recall = true_positives / (possible_positives + K.epsilon())
#     return recall


def recall_m(y_true, y_pred):
    """Recall of the distance-based anomaly score.

    The score of a sample is the squared norm of its output vector
    (``sum(y_pred ** 2)`` along axis 1), the same quantity used by
    ``precision_m``. A sample counts as predicted-positive when its score,
    clipped to [0, 1], rounds to 1.

    Args:
        y_true: binary labels, one per sample.
        y_pred: model outputs; assumed rank 2 ``(batch, features)`` - TODO confirm.

    Returns:
        ``true_positives / (actual_positives + epsilon)`` as a scalar tensor.
    """
    # Reduce per sample (axis=1): summing over every axis would collapse the
    # whole batch into one scalar shared by all samples.
    score = K.flatten(K.sum(K.square(y_pred), axis=1))
    # Flatten the labels so a (batch, 1) label tensor is not broadcast against
    # the (batch,) score vector into a (batch, batch) matrix.
    y_true = K.flatten(K.cast(y_true, K.dtype(score)))

    true_positives = K.sum(K.round(K.clip(y_true * score, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())


def precision_m(y_true, y_pred):
    """Precision of the distance-based anomaly score.

    The score of a sample is ``sum(y_pred ** 2)`` along axis 1. A sample counts
    as predicted-positive when its score, clipped to [0, 1], rounds to 1.

    Args:
        y_true: binary labels, one per sample.
        y_pred: model outputs; assumed rank 2 ``(batch, features)`` - TODO confirm.

    Returns:
        ``true_positives / (predicted_positives + epsilon)`` as a scalar tensor.
    """
    score = K.flatten(K.sum(K.square(y_pred), axis=1))
    # Flatten the labels so a (batch, 1) label tensor is not broadcast against
    # the (batch,) score vector into a (batch, batch) matrix.
    y_true = K.flatten(K.cast(y_true, K.dtype(score)))

    true_positives = K.sum(K.round(K.clip(y_true * score, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(score, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    The raw model output is forwarded unchanged because both helpers reduce
    ``y_pred`` themselves; reducing it to a scalar here first makes
    ``precision_m`` fail when it reduces along ``axis=1``.

    Returns:
        ``2 * precision * recall / (precision + recall + epsilon)``.
    """
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

def accuracy_m(y_true, y_pred):
    """Accuracy of the distance-based anomaly score.

    The score of a sample is ``sum(y_pred ** 2)`` along axis 1; it is clipped
    to [0, 1] and rounded to obtain a 0/1 prediction (the same binarisation
    used by ``precision_m``), which is then compared with the label.

    Args:
        y_true: binary labels, one per sample.
        y_pred: model outputs; assumed rank 2 ``(batch, features)`` - TODO confirm.

    Returns:
        Fraction of samples whose binarised score equals the label.
    """
    # Per-sample reduction; summing over every axis would yield one scalar for
    # the whole batch.
    score = K.flatten(K.sum(K.square(y_pred), axis=1))
    y_true = K.flatten(K.cast(y_true, K.dtype(score)))
    # Clip before rounding so a large distance still maps to class 1.
    predicted = K.round(K.clip(score, 0, 1))
    matches = K.cast(K.equal(y_true, predicted), K.dtype(score))
    return K.mean(matches)

def create_model(model_dim=16, max_len=50, n_heads=2, encoder_stack=2,
                 decoder_stack=2, feed_forward_size=16, vocab_size=None):
    """Build the two-input Transformer model with average pooling over axis 1.

    Called without arguments it builds the same model as before; the keyword
    arguments expose the previously hard-coded hyperparameters.

    Args:
        model_dim: embedding width passed to ``Transformer``.
        max_len: length of the encoder and decoder input sequences.
        n_heads: number of attention heads.
        encoder_stack: number of encoder blocks.
        decoder_stack: number of decoder blocks.
        feed_forward_size: inner width of the feed-forward layers.
        vocab_size: vocabulary size; defaults to the module-level
            ``tokenizer.n_words``.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[encoder_inputs, decoder_inputs]``.
    """
    if vocab_size is None:
        vocab_size = tokenizer.n_words

    encoder_inputs = tf.keras.Input(shape=(max_len,), name='encoder_inputs')
    decoder_inputs = tf.keras.Input(shape=(max_len,), name='decoder_inputs')

    transformer_out = Transformer(
        vocab_size,
        model_dim,
        n_heads=n_heads,
        encoder_stack=encoder_stack,
        decoder_stack=decoder_stack,
        feed_forward_size=feed_forward_size
    )(encoder_inputs, decoder_inputs)
    # Average over the sequence axis to obtain one vector per sample.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(transformer_out)

    return tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=pooled)

# Optimizer setup: Adam driven by an exponentially decaying learning rate.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=10_000,
    decay_rate=1 - 0.001,
)
model_opt = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    beta_1=0.9,
    beta_2=0.999,
)

def custom_loss_function(y_true, y_pred):
    """Distance-based loss on the squared norm of the model output.

    With ``dist = sum(y_pred ** 2)`` along axis 1 and ``p = dist ** 2`` the
    per-sample loss is ``(1 - y) * p - y * log(1 - exp(-p))``: samples labelled
    0 are pulled towards the origin, samples labelled 1 are pushed away.

    Args:
        y_true: binary labels, one per sample.
        y_pred: model outputs; assumed rank 2 ``(batch, features)`` - TODO confirm.

    Returns:
        Mean loss over the batch as a scalar tensor.
    """
    dist = K.flatten(K.sum(K.square(y_pred), axis=1))
    # Flatten the labels so a (batch, 1) label tensor is not broadcast against
    # the (batch,) distances into a (batch, batch) matrix.
    y_true = K.flatten(K.cast(y_true, K.dtype(dist)))

    penalty = K.square(dist)
    normal_term = (1 - y_true) * penalty
    # epsilon keeps the logarithm finite when the penalty is exactly 0.
    anomaly_term = y_true * K.log(1 - K.exp(-penalty) + K.epsilon())
    return K.mean(normal_term - anomaly_term)
# Build the model, on a TPU when requested, then compile and train it.
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)

    # Model variables must be created inside the strategy scope.
    with strategy.scope():
        model = create_model()
else:
    # Without a TPU the model is built under the default strategy, so `model`
    # is defined on both paths.
    model = create_model()

model.compile(optimizer=model_opt, loss=custom_loss_function)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# NOTE(review): `batch_size` must be defined at module level before this cell
# runs; it is not assigned anywhere in the visible part of the notebook.
model.fit([x_train, x_train_masks], y_train,
          batch_size=batch_size, epochs=1)

# Define spherical loss function and optimizer 

# Test Train

In [None]:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

"""## Implement a Transformer block as a layer"""

class TransformerBlock(layers.Layer):
    """Self-attention block: attention and feed-forward, each followed by
    dropout, a residual connection and layer normalization.

    Args:
        embed_dim: width of the input/output vectors (also used as ``key_dim``).
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate applied after attention and after the feed-forward net.
        **kwargs: forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.05, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to None so the layer can also be called without
        an explicit flag; it is forwarded to both dropout layers.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

"""## Implement embedding layer

Two separate embedding layers, one for tokens, one for token index (positions).
"""

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: width of both embeddings.
        **kwargs: forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions 0 .. L-1, where L is the size of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config


# Hyperparameters and construction of the single-input classifier:
# embedding -> transformer block (applied twice) -> max pooling -> sigmoid unit.
# NOTE(review): this cell reads `tokenizer.num_words`, while create_model()
# reads `tokenizer.n_words` - confirm which attribute the tokenizer exposes.
input_vocab_size = tokenizer.num_words #tokenizer.n_words # size of input data
# NOTE(review): output_vocab_size is not referenced below in this cell.
output_vocab_size = 2 # binary classification output: normal or anomaly data
maxlen = 50  # input sequence length; also the size of the position-embedding table
embed_dim = 16  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 16  # Hidden layer size in feed forward network inside transformer
# NOTE(review): dropout_rate is only referenced by the commented-out lines
# below; TransformerBlock is built with its own default `rate`.
dropout_rate = 0.05

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, input_vocab_size, embed_dim)

x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# The same layer instance is called twice, so both passes share one set of
# weights. NOTE(review): confirm weight sharing is intended.
x = transformer_block(x)
x = transformer_block(x)
x = layers.GlobalMaxPooling1D()(x)
# x = layers.Dropout(dropout_rate)(x)
# x = layers.Dense(20, activation="relu")(x)
# x = layers.Dropout(dropout_rate)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
# transformer model
model = keras.Model(inputs=inputs, outputs=outputs)

# learning rate decay for optimizer
# (rebinds the module-level names lr_schedule / model_opt with a smaller initial rate)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0001, # 0.0001,
    decay_steps=10000,
    decay_rate=1-0.001)
# optimizer
model_opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule,beta_1=0.9, beta_2=0.999)

In [None]:
# Compile with the custom distance-based loss. The model has a single output
# (one Dense(1) head), so no per-output `loss_weights` list is passed: a list
# is expected to map 1:1 onto the model outputs.
model.compile(
    model_opt,
    loss=custom_loss_function,
    metrics=["accuracy", recall_m, precision_m, f1_m],
)

In [None]:
# Report the dataset shapes before training.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# Optional subsampling experiment, kept for reference:
# idx = np.random.choice(np.arange(len(x_train)), 1000000, replace=False)
# x_train_small = x_train[idx]
# y_train_small = y_train[idx]
# idx_test = np.random.choice(np.arange(len(x_test)), 60000, replace=False)
# x_test_small = x_test[idx_test]
# y_test_small = y_test[idx_test]
# print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
# print(collections.Counter(y_train_small))
# history = model.fit(x_train_small, y_train_small, batch_size=2048, epochs=30, validation_data=(x_test_small, y_test_small), shuffle=True ,)

# Train on the full training set, validating against the test split each epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=30,
    validation_data=(x_test, y_test),
    shuffle=True,
)

# Train and TE

In [None]:
# ROC curve for the test set, with the AUC shown in the legend.
lw = 2
roc_label = 'ROC curve (area = %0.2f)' % roc_auc_score(
    test_ground_labels.astype(np.int32), max_distances)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=roc_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()