In [None]:
import random as rd
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
import os

from keras.engine.training_utils import standardize_input_data

from itertools import chain

from dga_classifier import data

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
#@title Model parameters
#@markdown Select the most appropriate parameters.

# Tunable notebook parameters. The trailing `#@param` annotations are Colab
# form directives and must stay on the same line as the assignment.

# Number of samples requested from `data.get_data`.
nb_data_to_generate = 10000 #@param {type: "slider", min: 100, max:20000}

# Triplets per training batch, batches per epoch (value returned by
# `DataGenerator.__len__`) and epochs of the first training run.
batch_size = 500 #@param {type: "slider", min: 32, max: 1000}
steps_per_epoch = 256 #@param {type: "slider", min: 1, max: 1000}
nb_epochs = 20 #@param {type: "slider", min: 1, max: 300}

# Width of the learned embedding: units of the encoder's last Dense layer and
# the `N` used by the triplet loss to slice the concatenated model output.
output_dim = 128 #@param {type: "slider", min: 16, max: 512}

# Number of K-Means clusters (hence anchor samples) computed per class.
nb_cluster_representant = 1 #@param {type: "slider", min: 1, max: 12}

# Number of randomly drawn samples exported to the embedding projector, and
# the batch size used when computing their embeddings.
nb_embedding_data = 2000 #@param {type: "slider", min: 64, max: 8192}
batch_size_emb = 100 #@param {type: "slider", min: 32, max: 1000}

#@markdown ---

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields random (negative, anchor, positive) triplets.

    ``data`` is indexed as ``data[class_index][sample_index]``. For every class
    a set of *anchor* samples is selected: the samples whose embedding lies
    closest to each K-Means centroid of that class' embeddings. Negatives are
    anchors of another class, positives are random samples of the same class.

    At least two classes are required, since every negative is drawn from a
    class different from the anchor's one.

    Anchors are NOT refreshed automatically at the end of an epoch; call
    ``update()`` explicitly (e.g. from a callback) to recompute them with the
    current state of ``model``.
    """

    def __init__(self, data, model, data_initializer, batch_size):
        """Store the dataset and compute the initial anchors.

        Arguments:
        data -- per-class lists of samples (``data[class][sample]``)
        model -- callable mapping ``data`` to per-class embedding arrays
        data_initializer -- callable producing the initial per-class embeddings
                            from ``data`` (used before ``model`` is trained)
        batch_size -- number of triplets generated per batch
        """
        self.batch_size = batch_size
        self.model = model
        self.data = data
        self.X = data_initializer(data)
        self.nb_anchors = len(data)
        self.anchors = [None] * self.nb_anchors
        # The number of clusters is captured here, once, from the notebook
        # parameter; later changes of the global do not affect this instance.
        self.km = KMeans(n_clusters=nb_cluster_representant, n_jobs=-1)

        self.update_anchors()

        # For each class, the indices of all the other classes (candidate
        # classes for the negative sample).
        self.anchors_exclude = [
            list(chain(range(i), range(i + 1, self.nb_anchors)))
            for i in range(self.nb_anchors)
        ]

    def update(self):
        """Recompute the embeddings with ``model`` and then the anchors."""
        self.update_data()
        self.update_anchors()

    def update_data(self):
        """Replace the stored embeddings by ``model(data)``."""
        self.X = self.model(self.data)

    def update_anchors(self):
        """For each class, pick the sample nearest to every K-Means centroid."""
        for i in range(len(self.anchors)):
            centers = self.km.fit(self.X[i]).cluster_centers_
            # [0] keeps the argmin indices and drops the distances.
            self.anchors[i] = pairwise_distances_argmin_min(centers, self.X[i])[0]

    def get_anchor(self, i):
        """Return the anchor sample indices of class ``i``."""
        return self.anchors[i]

    def generate_data(self):
        """Build one batch of triplets.

        Returns:
        ([negatives, anchors, positives], targets) where ``targets`` is a
        constant ``[1, 0]`` row per triplet.
        """
        negatives, anchors, positives = [], [], []
        classes = np.random.randint(0, len(self.anchors), size=self.batch_size)
        for C in classes:
            neg_class = rd.choice(self.anchors_exclude[C])

            # Draw among the anchors that were actually computed rather than
            # trusting the (mutable) global cluster count: the two can diverge
            # when the parameter cell is re-run without rebuilding the generator.
            class_anchors = self.anchors[C]
            neg_anchors = self.anchors[neg_class]
            i_anchor = class_anchors[rd.randint(0, len(class_anchors) - 1)]
            i_anchor_neg = neg_anchors[rd.randint(0, len(neg_anchors) - 1)]

            negatives.append(self.data[neg_class][i_anchor_neg])
            anchors.append(self.data[C][i_anchor])
            positives.append(self.data[C][rd.randint(0, len(self.data[C]) - 1)])

        data = [np.array(x) for x in (negatives, anchors, positives)]

        return data, np.array([[1, 0]] * self.batch_size)

    def __getitem__(self, index):
        # Batches are random, so the requested index is irrelevant.
        return self.generate_data()

    def __len__(self):
        # Number of batches per epoch, read from the notebook parameter.
        return steps_per_epoch

In [None]:
class DataTranslator():
    """Translate domain names and labels to and from their numeric encodings.

    Characters are mapped to 1-based indices (0 is reserved for padding) and
    labels to 0-based indices. Both mappings are built from ordered sequences,
    so they are identical from one Python process to the next; this is required
    for saved model weights to stay meaningful after a kernel restart.
    """

    # Ordered alphabet of the accepted characters. A plain string (not a set)
    # is enumerated on purpose: set iteration order of strings depends on the
    # per-process hash seed.
    ALPHABET = 'abcdefghijklmnopqrstuvwxyz0123456789.-'

    def __init__(self, labels, domains):
        """Encode ``domains`` and ``labels``.

        Arguments:
        labels -- sequence of label names, one per domain (must be sortable)
        domains -- sequence of domain names made of characters of ``ALPHABET``
        """
        self.chars_map = {c: idx + 1 for idx, c in enumerate(self.ALPHABET)}
        self.chars_map_rev = {idx: c for c, idx in self.chars_map.items()}

        # Sorted so that a given label always receives the same index.
        unique_labels = sorted(set(labels))
        self.labels_map = {x: idx for idx, x in enumerate(unique_labels)}
        self.labels_map_rev = {idx: x for x, idx in self.labels_map.items()}

        # Numbers the labels
        self.labels_num = [self.labels_map[x] for x in labels]

        # Convert domain names to sequences of numbers
        # (+ pad at 64 because this is the max len of a domain name)
        self.domains_seq = [[self.chars_map[y] for y in x] for x in domains]
        self.domains_seq = DataTranslator.__pad_seq(self.domains_seq)

        self.domains = domains
        self.labels = labels

    @staticmethod
    def __pad_seq(seq):
        """Pad (on the left, with zeros) every sequence to a length of 64."""
        return tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=64)

    def nb_labels(self):
        """Return the number of distinct labels."""
        return len(self.labels_map)

    def make_data(self):
        """Group the encoded domains by label index.

        Returns:
        a list ``X`` where ``X[label_index]`` holds the encoded domains of
        that label, in their original order.
        """
        X = [[] for _ in range(self.nb_labels())]
        for label_idx, seq in zip(self.labels_num, self.domains_seq):
            X[label_idx].append(seq)
        return X

    def get_label_name(self, idx):
        """Return the label name of index ``idx`` (``None`` if unknown)."""
        return self.labels_map_rev.get(idx)

    def get_label_index(self, label_name):
        """Return the index of ``label_name`` (``-1`` if unknown)."""
        return self.labels_map.get(label_name, -1)

    def domain_to_vec(self, domain_name):
        """Encode a single domain name as a padded vector of character indices.

        Raises:
        ValueError -- if the name is longer than 64 characters or contains a
                      character outside of the accepted alphabet
        """
        if len(domain_name) > 64:
            raise ValueError("domain name should contains less than 64 chars")
        try:
            translation = [self.chars_map[c] for c in domain_name]
        except KeyError:
            # A failed dict lookup raises KeyError (not NameError).
            raise ValueError("given domain name contains unauthorized chars")

        return DataTranslator.__pad_seq([translation])[0]

    def vec_to_domain(self, vec):
        """Decode a vector of character indices; unknown indices (e.g. the
        0 padding) are skipped."""
        return ''.join(self.chars_map_rev.get(c, '') for c in vec)
        

In [None]:
# Fetch the (label, domain) pairs and split them into two parallel tuples.
labelled_domains = data.get_data(nb_data_to_generate)
labels, domains = zip(*labelled_domains)

In [None]:
# Build the character/label encodings and encode the whole dataset.
translator = DataTranslator(labels=labels, domains=domains)

In [None]:
# Encoded domains grouped by label index: X[label_index] is the list of
# padded character-index sequences belonging to that label.
X = translator.make_data()

In [None]:
def lossless_triplet_loss(y_true, y_pred, N=None, beta=None, epsilon=1e-8):
    """
    Implementation of the "lossless" triplet loss function

    Arguments:
    y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    y_pred -- 2-D tensor holding, for every sample, three encodings of N values
              concatenated along the last axis, in this order:
            negative -- columns [0, N): the encodings for the negative data (different from anchor)
            anchor -- columns [N, 2N): the encodings for the anchor data
            positive -- columns [2N, 3N): the encodings for the positive data (similar to anchor)
    N  --  The number of dimensions of one encoding (defaults to the notebook's `output_dim`)
    beta -- The scaling factor, N is recommended (defaults to N)
    epsilon -- The Epsilon value to prevent ln(0)

    The logarithms are only defined while the squared distances stay below
    roughly `beta`; this holds when every encoding component lies in [0, 1]
    and beta >= N.

    Returns:
    loss -- 1-D tensor with one loss value per sample
    """
    if N is None:
        N = output_dim
    if beta is None:
        beta = N

    negative = tf.convert_to_tensor(y_pred[:, 0:N])
    anchor = tf.convert_to_tensor(y_pred[:, N:N*2])
    positive = tf.convert_to_tensor(y_pred[:, N*2:N*3])

    # squared distance between the anchor and the positive
    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
    # squared distance between the anchor and the negative
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)

    # Non linear values: -ln(-x/beta + 1).
    # The negative term is applied to (N - neg_dist) so that it is minimal
    # when the negative is as far as possible from the anchor.
    pos_dist = -tf.log(-tf.divide((pos_dist), beta) + 1 + epsilon)
    neg_dist = -tf.log(-tf.divide((N - neg_dist), beta) + 1 + epsilon)

    # compute loss
    loss = neg_dist + pos_dist

    return loss

def build_model(input_dim, embedding_voc_len, alpha=0.25):
    """Build the shared encoder and the triplet model wrapped around it.

    Arguments:
    input_dim -- length of one (padded) input sequence
    embedding_voc_len -- vocabulary size given to the Embedding layer
    alpha -- not used by this function

    Returns:
    (encoder, model) -- the encoder alone, and the compiled three-input model
    whose output is the concatenation [negative | anchor | positive] of the
    three encodings.
    """
    # One input per branch; the order matches what the loss expects:
    # a sample of another class, the reference sample, a sample of its class.
    branch_inputs = [
        tf.keras.Input(shape=(input_dim,), name=branch_name)
        for branch_name in ('negative', 'anchor', 'positive')
    ]

    # A single encoder instance, so the three branches share their weights.
    encoder_layers = [
        tf.keras.layers.Embedding(embedding_voc_len, 64,
                                  input_length=input_dim, mask_zero=True),
        tf.keras.layers.LSTM(units=64),
        tf.keras.layers.Dense(output_dim, activation='sigmoid',
                              name='custom_embedding'),
    ]
    encoder = tf.keras.Sequential(encoder_layers)

    # Encode every branch, then glue the encodings side by side.
    encodings = [encoder(branch_input) for branch_input in branch_inputs]
    merged = tf.keras.layers.concatenate(encodings, axis=-1)

    model = tf.keras.Model(inputs=branch_inputs, outputs=merged)
    model.compile(optimizer='adam', loss=lossless_triplet_loss)

    return encoder, model

In [None]:
# 64 is the length the domain sequences are padded to; the vocabulary gets
# one extra entry because character indices start at 1 (0 is the padding).
encoder, model = build_model(
    input_dim=64,
    embedding_voc_len=len(translator.chars_map) + 1,
)

In [None]:
def predict(X):
    """Run the encoder on every class of `X` and return one embedding array
    per class, in the same order."""
    return [encoder.predict(np.array(class_samples)) for class_samples in X]

def data_initializer(X):
    """Return, for every class of `X`, a standard-normal random array with
    one row per sample and `output_dim` columns."""
    return [np.random.randn(len(class_samples), output_dim) for class_samples in X]

# Triplet generator: starts from random embeddings, later refreshed through
# `predict` whenever `dgen.update()` is called.
dgen = DataGenerator(
    data=X,
    model=predict,
    data_initializer=data_initializer,
    batch_size=batch_size,
)

In [None]:
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.contrib.tensorboard.plugins import projector

In [None]:
# Random sample (drawn with replacement) exported to the embedding projector.
rnd_indices = np.random.randint(
    low=0, high=len(translator.labels), size=nb_embedding_data)
# Labels of the sampled domains, used as projector metadata.
y_test_emb = [translator.labels[idx] for idx in rnd_indices]
# Encoded domains, wrapped in a one-element list (one entry per model input).
x_test_emb = [[translator.domains_seq[idx] for idx in rnd_indices]]

In [None]:
logs_dir = "./logs"
embedding_metadata = "emb_metadata.tsv"
# The log directory does not exist on a fresh checkout; create it so that the
# metadata file can be written on a clean "Restart & Run All".
os.makedirs(logs_dir, exist_ok=True)
# One label per line, in the same order as the sampled embeddings.
with open(os.path.join(logs_dir, embedding_metadata), "wb") as f:
    np.savetxt(f, y_test_emb, "%s")

In [None]:
class emb_projector(tf.keras.callbacks.Callback):
    """Export the output of a layer to the TensorBoard embedding projector.

    The instance is meant to be driven manually: call ``set_model`` once to
    build the graph operations, then ``log_emb`` whenever a snapshot of the
    embeddings must be written.

    NOTE: ``set_model`` takes an extra ``layer`` argument compared to the
    regular Keras callback hook, so this object cannot be passed directly in
    the ``callbacks`` list of a ``fit`` call.
    """

    def __init__(self, model, X, metadata_path, batch_size=32, freq=1, logs_dir='./logs'):
        """
        Arguments:
        model -- model whose ``input_names`` are used to standardize ``X``
        X -- input data to embed
        metadata_path -- path of the metadata (labels) file given to the projector
        batch_size -- number of samples embedded per session run
        freq -- stored, but not used by this class
        logs_dir -- directory receiving the projector config and checkpoints
        """
        self.sess = K.get_session()

        self.freq = freq
        self.batch_size = batch_size

        self.x_std = standardize_input_data(X, model.input_names)

        self.writer = tf.summary.FileWriter(logs_dir)
        self.logs_dir = logs_dir

        self.metadata_path = metadata_path

    def set_model(self, model, layer):
        """Create the variable, assign op, saver and projector config used to
        export the output of ``layer``.

        Every call adds new placeholders, a new variable and a new saver to
        the graph, so it should be called once per (model, layer) pair.
        """
        self.model = model

        # Offset of the current batch in the variable, and its size.
        self.batch_id = batch_id = tf.placeholder(tf.int32)
        self.step = step = tf.placeholder(tf.int32)

        # --- Creation of the embedding layer ---
        emb_input = layer.output
        emb_size = np.prod(emb_input.shape[1:])
        emb_input = tf.reshape(emb_input, (step, int(emb_size)))

        # One row per sample of the first standardized input.
        shape = (self.x_std[0].shape[0], int(emb_size))
        self.emb_var = tf.Variable(tf.zeros(shape), name=layer.name + "_embedding")
        self.assign_emb = tf.assign(self.emb_var[batch_id:batch_id+step], emb_input)

        self.saver = tf.train.Saver([self.emb_var])

        config = projector.ProjectorConfig()

        embedding = config.embeddings.add()
        embedding.tensor_name = self.emb_var.name
        embedding.metadata_path = self.metadata_path

        projector.visualize_embeddings(self.writer, config)

        self.sess.run(self.emb_var.initializer)

    def log_emb(self, checkpoint_name=''):
        """Compute the embeddings batch by batch and save them in the
        checkpoint ``emb_checkpoint_<checkpoint_name>.ckpt`` of the log
        directory."""
        n_samples = self.x_std[0].shape[0]

        for i in range(0, n_samples, self.batch_size):
            # The last batch may be smaller than batch_size.
            step = min(self.batch_size, n_samples - i)

            feed_dict = {
                self.model.input: self.x_std[0][i:i + step],
                self.batch_id: i,
                self.step: step,
            }
            self.sess.run(self.assign_emb, feed_dict=feed_dict)

        # Write the checkpoint once, when the variable is complete, instead of
        # rewriting the same file after every batch.
        if n_samples > 0:
            self.saver.save(self.sess,
                            os.path.join(self.logs_dir, 'emb_checkpoint_' + checkpoint_name + '.ckpt'))

In [None]:
# Projector exporter for the sampled domains, embedded through the encoder.
emb_proj = emb_projector(
    model=encoder,
    X=x_test_emb,
    metadata_path=embedding_metadata,
    batch_size=batch_size_emb,
    freq=5,
    logs_dir=logs_dir,
)

In [None]:
def proj_update(epoch, logs):
    """Epoch-end hook: write the current embeddings to a checkpoint named
    after the epoch number."""
    # `set_model` creates new placeholders, a new variable and a new saver on
    # every call; bind the projector only when it is not already bound to the
    # current encoder, so the graph does not grow at each epoch.
    if getattr(emb_proj, 'model', None) is not encoder:
        emb_proj.set_model(encoder,  encoder.get_layer('custom_embedding'))
    emb_proj.log_emb(str(epoch))
    
# Recompute the class embeddings and the anchors after every epoch.
update_anchors_cb = tf.keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: dgen.update(),
)

# Export the embeddings to the projector after every epoch.
projector_cb = tf.keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: proj_update(epoch, logs),
)

# First training run; anchors are refreshed before the projector export.
learn_history = model.fit_generator(
    dgen,
    epochs=nb_epochs,
    verbose=1,
    callbacks=[update_anchors_cb, projector_cb],
)

In [None]:
# Additional training run of 100 epochs.
# NOTE(review): only the projector callback is passed here, so the anchors
# are not refreshed during this run - confirm this is intended.
model.fit_generator(
    dgen,
    epochs=100,
    verbose=1,
    callbacks=[projector_cb],
)

In [None]:
# Embed (at most) the first `nb_pred` samples of every class.
# The loop variable must not be called `data`: that name is the imported
# `dga_classifier.data` module, and rebinding it would break a later re-run
# of the data loading cell.
nb_pred = 100
pred = [encoder.predict([class_samples[:nb_pred]]) for class_samples in dgen.data]

In [None]:
# Persist the trained encoder in two files: the weights (HDF5) ...
encoder.save_weights('model_weights.h5')

# ... and the architecture (JSON).
with open('model_architecture.json', 'w') as architecture_file:
    architecture_file.write(encoder.to_json())

In [None]:
# Statistics of the euclidean distances between the embeddings of two classes
# (label indices u and v).
u,v=5,1

# One vectorized call computes every (i, j) pair; ravel() keeps the order of
# the nested loops (i outer, j inner) and copes with classes holding fewer
# than `nb_pred` samples.
l = pairwise_distances(pred[u], pred[v]).ravel()
print("mean  :", np.mean(l))
print("median:", np.median(l))
print("max   :", np.max(l))
print("min   :", np.min(l))